diff --git a/llvm/include/llvm/CodeGen/SlotIndexes.h b/llvm/include/llvm/CodeGen/SlotIndexes.h index 7e013dbf2ab38..863f1f585ef99 100644 --- a/llvm/include/llvm/CodeGen/SlotIndexes.h +++ b/llvm/include/llvm/CodeGen/SlotIndexes.h @@ -640,6 +640,9 @@ class raw_ostream; renumberIndexes(newItr); llvm::sort(idx2MBBMap, less_first()); } + + /// Renumber all indexes using the default instruction distance. + void packIndexes(); }; // Specialize IntervalMapInfo for half-open slot index intervals. diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index f97cb1a0fb722..36625c4848c53 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -2692,6 +2692,9 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { return false; Indexes = &getAnalysis(); + // Renumber to get accurate and consistent results from + // SlotIndexes::getApproxInstrDistance. + Indexes->packIndexes(); MBFI = &getAnalysis(); DomTree = &getAnalysis(); ORE = &getAnalysis().getORE(); diff --git a/llvm/lib/CodeGen/SlotIndexes.cpp b/llvm/lib/CodeGen/SlotIndexes.cpp index 47ee36971d0ea..65726f06dedb4 100644 --- a/llvm/lib/CodeGen/SlotIndexes.cpp +++ b/llvm/lib/CodeGen/SlotIndexes.cpp @@ -237,6 +237,11 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB, } } +void SlotIndexes::packIndexes() { + for (auto [Index, Entry] : enumerate(indexList)) + Entry.setIndex(Index * SlotIndex::InstrDist); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void SlotIndexes::dump() const { for (const IndexListEntry &ILE : indexList) { diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll index e8437b5cd801f..a65c5d6667794 100644 --- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll +++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll @@ -221,69 +221,69 @@ define @lane_mask_nxv32i1_i64(i64 %index, i64 %TC) { ; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: mov z0.d, x0 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z7.d, x1 +; CHECK-NEXT: mov z3.d, x1 ; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: mov z4.d, z1.d ; CHECK-NEXT: mov z6.d, z1.d -; CHECK-NEXT: uqadd z5.d, z1.d, z0.d +; CHECK-NEXT: uqadd z25.d, z1.d, z0.d ; CHECK-NEXT: incd z1.d, all, mul #8 ; CHECK-NEXT: incd z2.d -; CHECK-NEXT: incd z3.d, all, mul #2 +; CHECK-NEXT: incd z4.d, all, mul #2 ; CHECK-NEXT: incd z6.d, all, mul #4 -; CHECK-NEXT: cmphi p1.d, p0/z, z7.d, z5.d +; CHECK-NEXT: cmphi p1.d, p0/z, z3.d, z25.d ; CHECK-NEXT: uqadd z1.d, z1.d, z0.d -; CHECK-NEXT: mov z4.d, z2.d -; CHECK-NEXT: uqadd z24.d, z2.d, z0.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z27.d, z3.d -; CHECK-NEXT: uqadd z26.d, z3.d, z0.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: uqadd z26.d, z2.d, z0.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: mov z24.d, z4.d +; CHECK-NEXT: uqadd z27.d, z4.d, z0.d ; CHECK-NEXT: uqadd z28.d, z6.d, z0.d ; CHECK-NEXT: incd z2.d, all, mul #8 -; CHECK-NEXT: incd z3.d, all, mul #8 -; CHECK-NEXT: incd z6.d, all, mul #8 -; CHECK-NEXT: incd z4.d, all, mul #2 -; CHECK-NEXT: incd z25.d, all, mul #4 -; CHECK-NEXT: cmphi p2.d, p0/z, z7.d, z24.d -; CHECK-NEXT: incd z27.d, all, mul #4 -; CHECK-NEXT: cmphi p3.d, p0/z, z7.d, z26.d -; CHECK-NEXT: cmphi p5.d, p0/z, z7.d, z28.d -; CHECK-NEXT: uqadd z2.d, z2.d, z0.d -; CHECK-NEXT: uqadd z3.d, z3.d, z0.d -; CHECK-NEXT: mov z24.d, z4.d -; CHECK-NEXT: uqadd z5.d, z4.d, z0.d -; CHECK-NEXT: uqadd z26.d, z25.d, z0.d ; CHECK-NEXT: incd z4.d, all, mul #8 -; CHECK-NEXT: incd z25.d, all, mul #8 -; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s +; CHECK-NEXT: incd z6.d, all, mul #8 +; CHECK-NEXT: incd z5.d, all, mul #2 +; CHECK-NEXT: incd z7.d, all, mul #4 +; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z26.d ; CHECK-NEXT: incd z24.d, all, mul #4 -; CHECK-NEXT: cmphi p8.d, p0/z, z7.d, z2.d -; CHECK-NEXT: cmphi p4.d, p0/z, z7.d, z5.d -; CHECK-NEXT: uqadd z5.d, z27.d, z0.d -; CHECK-NEXT: incd z27.d, all, mul #8 +; CHECK-NEXT: cmphi p3.d, p0/z, z3.d, z27.d +; CHECK-NEXT: cmphi p5.d, p0/z, z3.d, z28.d +; CHECK-NEXT: uqadd z2.d, z2.d, z0.d ; CHECK-NEXT: uqadd z4.d, z4.d, z0.d -; CHECK-NEXT: cmphi p6.d, p0/z, z7.d, z26.d -; CHECK-NEXT: uqadd z28.d, z24.d, z0.d +; CHECK-NEXT: uqadd z6.d, z6.d, z0.d +; CHECK-NEXT: mov z26.d, z5.d +; CHECK-NEXT: uqadd z25.d, z5.d, z0.d +; CHECK-NEXT: uqadd z27.d, z7.d, z0.d +; CHECK-NEXT: incd z5.d, all, mul #8 +; CHECK-NEXT: incd z7.d, all, mul #8 +; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s +; CHECK-NEXT: incd z26.d, all, mul #4 +; CHECK-NEXT: cmphi p8.d, p0/z, z3.d, z2.d +; CHECK-NEXT: cmphi p4.d, p0/z, z3.d, z25.d +; CHECK-NEXT: uqadd z25.d, z24.d, z0.d ; CHECK-NEXT: incd z24.d, all, mul #8 +; CHECK-NEXT: uqadd z5.d, z5.d, z0.d +; CHECK-NEXT: uqadd z7.d, z7.d, z0.d +; CHECK-NEXT: cmphi p6.d, p0/z, z3.d, z27.d +; CHECK-NEXT: uqadd z28.d, z26.d, z0.d +; CHECK-NEXT: incd z26.d, all, mul #8 ; CHECK-NEXT: uzp1 p3.s, p3.s, p4.s -; CHECK-NEXT: cmphi p7.d, p0/z, z7.d, z5.d -; CHECK-NEXT: uqadd z5.d, z6.d, z0.d -; CHECK-NEXT: uqadd z6.d, z25.d, z0.d -; CHECK-NEXT: uqadd z25.d, z27.d, z0.d -; CHECK-NEXT: cmphi p4.d, p0/z, z7.d, z1.d +; CHECK-NEXT: uqadd z24.d, z24.d, z0.d +; CHECK-NEXT: cmphi p7.d, p0/z, z3.d, z25.d +; CHECK-NEXT: cmphi p4.d, p0/z, z3.d, z1.d ; CHECK-NEXT: uzp1 p5.s, p5.s, p6.s -; CHECK-NEXT: cmphi p6.d, p0/z, z7.d, z3.d -; CHECK-NEXT: cmphi p9.d, p0/z, z7.d, z4.d -; CHECK-NEXT: uqadd z0.d, z24.d, z0.d -; CHECK-NEXT: cmphi p2.d, p0/z, z7.d, z28.d -; CHECK-NEXT: cmphi p10.d, p0/z, z7.d, z6.d +; CHECK-NEXT: cmphi p6.d, p0/z, z3.d, z4.d +; CHECK-NEXT: cmphi p9.d, p0/z, z3.d, z5.d +; CHECK-NEXT: cmphi p10.d, p0/z, z3.d, z7.d +; CHECK-NEXT: uqadd z0.d, z26.d, z0.d +; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z28.d ; CHECK-NEXT: uzp1 p4.s, p4.s, p8.s -; CHECK-NEXT: cmphi p8.d, p0/z, z7.d, z25.d +; CHECK-NEXT: cmphi p8.d, p0/z, z3.d, z24.d ; CHECK-NEXT: uzp1 p6.s, p6.s, p9.s ; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p2.s, p7.s, p2.s -; CHECK-NEXT: cmphi p7.d, p0/z, z7.d, z5.d -; CHECK-NEXT: cmphi p0.d, p0/z, z7.d, z0.d ; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h +; CHECK-NEXT: uzp1 p2.s, p7.s, p2.s +; CHECK-NEXT: cmphi p7.d, p0/z, z3.d, z6.d +; CHECK-NEXT: cmphi p0.d, p0/z, z3.d, z0.d ; CHECK-NEXT: uzp1 p7.s, p7.s, p10.s ; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: uzp1 p0.s, p8.s, p0.s diff --git a/llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll b/llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll index 091fb7f0c730a..65cc368c0b561 100644 --- a/llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll +++ b/llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll @@ -10,28 +10,28 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 define zeroext i8 @fullGtU(i32 %i1, i32 %i2) { ; CHECK-LABEL: fullGtU: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: adrp x8, _block@GOTPAGE +; CHECK-NEXT: adrp x9, _block@GOTPAGE ; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: ; kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: sxtw x8, w0 ; CHECK-NEXT: sxtw x10, w1 -; CHECK-NEXT: ldr x8, [x8, _block@GOTPAGEOFF] -; CHECK-NEXT: ldr x8, [x8] -; CHECK-NEXT: ldrb w11, [x8, x9] -; CHECK-NEXT: ldrb w12, [x8, x10] +; CHECK-NEXT: ldr x9, [x9, _block@GOTPAGEOFF] +; CHECK-NEXT: ldr x9, [x9] +; CHECK-NEXT: ldrb w11, [x9, x8] +; CHECK-NEXT: ldrb w12, [x9, x10] ; CHECK-NEXT: cmp w11, w12 ; CHECK-NEXT: b.ne LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %if.end -; CHECK-NEXT: add x9, x9, x8 -; CHECK-NEXT: add x8, x10, x8 -; CHECK-NEXT: ldrb w10, [x9, #1] -; CHECK-NEXT: ldrb w11, [x8, #1] +; CHECK-NEXT: add x8, x8, x9 +; CHECK-NEXT: add x9, x10, x9 +; CHECK-NEXT: ldrb w10, [x8, #1] +; CHECK-NEXT: ldrb w11, [x9, #1] ; CHECK-NEXT: cmp w10, w11 ; CHECK-NEXT: b.ne LBB0_3 ; CHECK-NEXT: ; %bb.2: ; %if.end25 -; CHECK-NEXT: ldrb w9, [x9, #2] ; CHECK-NEXT: ldrb w8, [x8, #2] -; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: ldrb w9, [x9, #2] +; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w8, hi ; CHECK-NEXT: csel w0, wzr, w8, eq ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-cse.ll b/llvm/test/CodeGen/AArch64/arm64-cse.ll index 7afa30970dff2..3cacc25e02b21 100644 --- a/llvm/test/CodeGen/AArch64/arm64-cse.ll +++ b/llvm/test/CodeGen/AArch64/arm64-cse.ll @@ -8,16 +8,16 @@ target triple = "arm64-apple-ios" define ptr @t1(ptr %base, ptr nocapture %offset, i32 %size) nounwind { ; CHECK-LABEL: t1: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: ldr w9, [x1] -; CHECK-NEXT: subs w8, w9, w2 +; CHECK-NEXT: ldr w8, [x1] +; CHECK-NEXT: subs w9, w8, w2 ; CHECK-NEXT: b.ge LBB0_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: mov x0, xzr ; CHECK-NEXT: ret ; CHECK-NEXT: LBB0_2: ; %if.end -; CHECK-NEXT: add x0, x0, w8, sxtw -; CHECK-NEXT: sub w9, w9, w8 -; CHECK-NEXT: str w9, [x1] +; CHECK-NEXT: add x0, x0, w9, sxtw +; CHECK-NEXT: sub w8, w8, w9 +; CHECK-NEXT: str w8, [x1] ; CHECK-NEXT: ret entry: %0 = load i32, ptr %offset, align 4 diff --git a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll index 80b8c963a697c..5806bcf0dacf1 100644 --- a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll +++ b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll @@ -1059,15 +1059,15 @@ define void @stack_realign2(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr ; ENABLE-NEXT: .cfi_offset w27, -88 ; ENABLE-NEXT: .cfi_offset w28, -96 ; ENABLE-NEXT: lsl w8, w1, w0 -; ENABLE-NEXT: lsr w10, w0, w1 -; ENABLE-NEXT: lsl w16, w0, w1 +; ENABLE-NEXT: lsr w9, w0, w1 +; ENABLE-NEXT: lsl w14, w0, w1 ; ENABLE-NEXT: lsr w11, w1, w0 -; ENABLE-NEXT: add w14, w1, w0 -; ENABLE-NEXT: sub w9, w8, w10 +; ENABLE-NEXT: add w15, w1, w0 +; ENABLE-NEXT: sub w10, w8, w9 ; ENABLE-NEXT: subs w17, w1, w0 -; ENABLE-NEXT: add w15, w16, w8 -; ENABLE-NEXT: add w12, w10, w11 -; ENABLE-NEXT: add w13, w11, w14 +; ENABLE-NEXT: add w16, w14, w8 +; ENABLE-NEXT: add w12, w9, w11 +; ENABLE-NEXT: add w13, w11, w15 ; ENABLE-NEXT: b.le LBB14_2 ; ENABLE-NEXT: ; %bb.1: ; %true ; ENABLE-NEXT: str w0, [sp] @@ -1075,14 +1075,14 @@ define void @stack_realign2(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr ; ENABLE-NEXT: nop ; ENABLE-NEXT: ; InlineAsm End ; ENABLE-NEXT: LBB14_2: ; %false -; ENABLE-NEXT: str w16, [x2] +; ENABLE-NEXT: str w14, [x2] ; ENABLE-NEXT: str w8, [x3] -; ENABLE-NEXT: str w10, [x4] +; ENABLE-NEXT: str w9, [x4] ; ENABLE-NEXT: str w11, [x5] -; ENABLE-NEXT: str w14, [x6] +; ENABLE-NEXT: str w15, [x6] ; ENABLE-NEXT: str w17, [x7] ; ENABLE-NEXT: stp w0, w1, [x2, #4] -; ENABLE-NEXT: stp w15, w9, [x2, #12] +; ENABLE-NEXT: stp w16, w10, [x2, #12] ; ENABLE-NEXT: stp w12, w13, [x2, #20] ; ENABLE-NEXT: sub sp, x29, #80 ; ENABLE-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload @@ -1118,15 +1118,15 @@ define void @stack_realign2(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr ; DISABLE-NEXT: .cfi_offset w27, -88 ; DISABLE-NEXT: .cfi_offset w28, -96 ; DISABLE-NEXT: lsl w8, w1, w0 -; DISABLE-NEXT: lsr w10, w0, w1 -; DISABLE-NEXT: lsl w16, w0, w1 +; DISABLE-NEXT: lsr w9, w0, w1 +; DISABLE-NEXT: lsl w14, w0, w1 ; DISABLE-NEXT: lsr w11, w1, w0 -; DISABLE-NEXT: add w14, w1, w0 -; DISABLE-NEXT: sub w9, w8, w10 +; DISABLE-NEXT: add w15, w1, w0 +; DISABLE-NEXT: sub w10, w8, w9 ; DISABLE-NEXT: subs w17, w1, w0 -; DISABLE-NEXT: add w15, w16, w8 -; DISABLE-NEXT: add w12, w10, w11 -; DISABLE-NEXT: add w13, w11, w14 +; DISABLE-NEXT: add w16, w14, w8 +; DISABLE-NEXT: add w12, w9, w11 +; DISABLE-NEXT: add w13, w11, w15 ; DISABLE-NEXT: b.le LBB14_2 ; DISABLE-NEXT: ; %bb.1: ; %true ; DISABLE-NEXT: str w0, [sp] @@ -1134,14 +1134,14 @@ define void @stack_realign2(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr ; DISABLE-NEXT: nop ; DISABLE-NEXT: ; InlineAsm End ; DISABLE-NEXT: LBB14_2: ; %false -; DISABLE-NEXT: str w16, [x2] +; DISABLE-NEXT: str w14, [x2] ; DISABLE-NEXT: str w8, [x3] -; DISABLE-NEXT: str w10, [x4] +; DISABLE-NEXT: str w9, [x4] ; DISABLE-NEXT: str w11, [x5] -; DISABLE-NEXT: str w14, [x6] +; DISABLE-NEXT: str w15, [x6] ; DISABLE-NEXT: str w17, [x7] ; DISABLE-NEXT: stp w0, w1, [x2, #4] -; DISABLE-NEXT: stp w15, w9, [x2, #12] +; DISABLE-NEXT: stp w16, w10, [x2, #12] ; DISABLE-NEXT: stp w12, w13, [x2, #20] ; DISABLE-NEXT: sub sp, x29, #80 ; DISABLE-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll index f00265a80e032..01fd2b1113b00 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll @@ -26,25 +26,25 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: zip1 z1.d, z1.d, z1.d ; CHECK-NEXT: .LBB0_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: zip2 p2.d, p1.d, p1.d +; CHECK-NEXT: zip2 p3.d, p1.d, p1.d ; CHECK-NEXT: add x13, x0, x8 ; CHECK-NEXT: add x14, x1, x8 -; CHECK-NEXT: zip1 p3.d, p1.d, p1.d +; CHECK-NEXT: zip1 p2.d, p1.d, p1.d ; CHECK-NEXT: mov z6.d, z1.d ; CHECK-NEXT: mov z7.d, z0.d ; CHECK-NEXT: whilelo p1.d, x12, x9 ; CHECK-NEXT: add x8, x8, x11 ; CHECK-NEXT: add x12, x12, x10 -; CHECK-NEXT: ld1d { z2.d }, p2/z, [x13, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p3/z, [x13] -; CHECK-NEXT: ld1d { z4.d }, p2/z, [x14, #1, mul vl] -; CHECK-NEXT: ld1d { z5.d }, p3/z, [x14] +; CHECK-NEXT: ld1d { z2.d }, p3/z, [x13, #1, mul vl] +; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13] +; CHECK-NEXT: ld1d { z4.d }, p3/z, [x14, #1, mul vl] +; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14] ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 -; CHECK-NEXT: mov z0.d, p2/m, z7.d -; CHECK-NEXT: mov z1.d, p3/m, z6.d +; CHECK-NEXT: mov z0.d, p3/m, z7.d +; CHECK-NEXT: mov z1.d, p2/m, z6.d ; CHECK-NEXT: b.mi .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d @@ -237,19 +237,19 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt ; CHECK-NEXT: add x9, x9, x11 ; CHECK-NEXT: add x8, x8, x12 ; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 -; CHECK-NEXT: zip2 p2.d, p1.d, p1.d -; CHECK-NEXT: zip1 p3.d, p1.d, p1.d +; CHECK-NEXT: zip2 p3.d, p1.d, p1.d +; CHECK-NEXT: zip1 p2.d, p1.d, p1.d ; CHECK-NEXT: whilelo p1.d, x9, x10 -; CHECK-NEXT: ld1d { z2.d }, p2/z, [x13, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p3/z, [x13] -; CHECK-NEXT: ld1d { z4.d }, p2/z, [x14, #1, mul vl] -; CHECK-NEXT: ld1d { z5.d }, p3/z, [x14] +; CHECK-NEXT: ld1d { z2.d }, p3/z, [x13, #1, mul vl] +; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13] +; CHECK-NEXT: ld1d { z4.d }, p3/z, [x14, #1, mul vl] +; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14] ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 -; CHECK-NEXT: mov z0.d, p2/m, z7.d -; CHECK-NEXT: mov z1.d, p3/m, z6.d +; CHECK-NEXT: mov z0.d, p3/m, z7.d +; CHECK-NEXT: mov z1.d, p2/m, z6.d ; CHECK-NEXT: b.mi .LBB2_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll index 40fd7a392c83b..44d0a9392ba62 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll @@ -144,10 +144,10 @@ define %"struct.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_v2f64_unrolled: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: adrp x8, .LCPI2_0 ; CHECK-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_0] ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: .LBB2_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 @@ -159,24 +159,24 @@ define %"struct.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q7, q6, [x10] ; CHECK-NEXT: ldp q17, q16, [x9, #32] ; CHECK-NEXT: ldp q19, q18, [x10, #32] -; CHECK-NEXT: fcmla v1.2d, v7.2d, v5.2d, #0 +; CHECK-NEXT: fcmla v2.2d, v7.2d, v5.2d, #0 ; CHECK-NEXT: fcmla v0.2d, v6.2d, v4.2d, #0 -; CHECK-NEXT: fcmla v2.2d, v19.2d, v17.2d, #0 +; CHECK-NEXT: fcmla v1.2d, v19.2d, v17.2d, #0 ; CHECK-NEXT: fcmla v3.2d, v18.2d, v16.2d, #0 -; CHECK-NEXT: fcmla v1.2d, v7.2d, v5.2d, #90 +; CHECK-NEXT: fcmla v2.2d, v7.2d, v5.2d, #90 ; CHECK-NEXT: fcmla v0.2d, v6.2d, v4.2d, #90 -; CHECK-NEXT: fcmla v2.2d, v19.2d, v17.2d, #90 +; CHECK-NEXT: fcmla v1.2d, v19.2d, v17.2d, #90 ; CHECK-NEXT: fcmla v3.2d, v18.2d, v16.2d, #90 ; CHECK-NEXT: b.ne .LBB2_1 ; CHECK-NEXT: // %bb.2: // %middle.block -; CHECK-NEXT: zip2 v4.2d, v2.2d, v3.2d -; CHECK-NEXT: zip1 v2.2d, v2.2d, v3.2d -; CHECK-NEXT: zip2 v3.2d, v1.2d, v0.2d -; CHECK-NEXT: zip1 v0.2d, v1.2d, v0.2d +; CHECK-NEXT: zip2 v4.2d, v1.2d, v3.2d +; CHECK-NEXT: zip1 v1.2d, v1.2d, v3.2d +; CHECK-NEXT: zip2 v3.2d, v2.2d, v0.2d +; CHECK-NEXT: zip1 v0.2d, v2.2d, v0.2d +; CHECK-NEXT: fadd v0.2d, v1.2d, v0.2d ; CHECK-NEXT: fadd v1.2d, v4.2d, v3.2d -; CHECK-NEXT: fadd v0.2d, v2.2d, v0.2d -; CHECK-NEXT: faddp d1, v1.2d ; CHECK-NEXT: faddp d0, v0.2d +; CHECK-NEXT: faddp d1, v1.2d ; CHECK-NEXT: ret entry: %scevgep = getelementptr i8, ptr %a, i64 32 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll index 2cbc8ed3192de..e8d9ec7dc85de 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll @@ -217,7 +217,7 @@ define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) ; CHECK-NEXT: add x10, sp, #72 ; CHECK-NEXT: ld1 { v3.s }[1], [x9] ; CHECK-NEXT: add x9, sp, #64 -; CHECK-NEXT: ldr s17, [sp, #96] +; CHECK-NEXT: ldr s17, [sp, #104] ; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7 ; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4 ; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6 @@ -225,21 +225,21 @@ define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) ; CHECK-NEXT: ldr s20, [sp, #192] ; CHECK-NEXT: mov v1.s[2], v5.s[0] ; CHECK-NEXT: ld1 { v16.s }[2], [x10] -; CHECK-NEXT: ldr s5, [sp, #104] +; CHECK-NEXT: ldr s5, [sp, #96] ; CHECK-NEXT: ld1 { v3.s }[2], [x9] ; CHECK-NEXT: add x9, sp, #24 ; CHECK-NEXT: add x10, sp, #112 ; CHECK-NEXT: ld1 { v18.s }[1], [x9] ; CHECK-NEXT: add x9, sp, #88 ; CHECK-NEXT: mov v0.s[2], v4.s[0] -; CHECK-NEXT: ld1 { v17.s }[1], [x10] +; CHECK-NEXT: ld1 { v5.s }[1], [x10] ; CHECK-NEXT: add x10, sp, #80 ; CHECK-NEXT: ld1 { v16.s }[3], [x9] ; CHECK-NEXT: mov v1.s[3], v7.s[0] ; CHECK-NEXT: add x9, sp, #120 ; CHECK-NEXT: ldr s4, [sp, #128] ; CHECK-NEXT: ld1 { v3.s }[3], [x10] -; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: ld1 { v17.s }[1], [x9] ; CHECK-NEXT: add x9, sp, #144 ; CHECK-NEXT: ldr s7, [sp] ; CHECK-NEXT: ld1 { v4.s }[1], [x9] @@ -247,8 +247,8 @@ define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) ; CHECK-NEXT: add x10, sp, #16 ; CHECK-NEXT: add x9, sp, #160 ; CHECK-NEXT: fmul v6.4s, v16.4s, v1.4s -; CHECK-NEXT: fmul v19.4s, v5.4s, v18.4s -; CHECK-NEXT: fmul v18.4s, v17.4s, v18.4s +; CHECK-NEXT: fmul v19.4s, v17.4s, v18.4s +; CHECK-NEXT: fmul v18.4s, v5.4s, v18.4s ; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s ; CHECK-NEXT: ld1 { v7.s }[1], [x10] ; CHECK-NEXT: ld1 { v4.s }[2], [x9] @@ -259,21 +259,21 @@ define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) ; CHECK-NEXT: ld1 { v20.s }[1], [x10] ; CHECK-NEXT: fneg v6.4s, v6.4s ; CHECK-NEXT: fneg v19.4s, v19.4s -; CHECK-NEXT: fmla v18.4s, v7.4s, v5.4s +; CHECK-NEXT: fmla v18.4s, v7.4s, v17.4s ; CHECK-NEXT: fmla v1.4s, v0.4s, v16.4s ; CHECK-NEXT: ld1 { v4.s }[3], [x9] ; CHECK-NEXT: add x9, sp, #168 ; CHECK-NEXT: ld1 { v2.s }[2], [x9] -; CHECK-NEXT: ldr s5, [sp, #200] +; CHECK-NEXT: ldr s16, [sp, #200] ; CHECK-NEXT: add x9, sp, #216 ; CHECK-NEXT: add x10, sp, #184 ; CHECK-NEXT: fmla v6.4s, v0.4s, v3.4s -; CHECK-NEXT: fmla v19.4s, v7.4s, v17.4s -; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: fmla v19.4s, v7.4s, v5.4s +; CHECK-NEXT: ld1 { v16.s }[1], [x9] ; CHECK-NEXT: fsub v0.4s, v4.4s, v1.4s ; CHECK-NEXT: fsub v1.4s, v20.4s, v18.4s ; CHECK-NEXT: ld1 { v2.s }[3], [x10] -; CHECK-NEXT: fadd v3.4s, v5.4s, v19.4s +; CHECK-NEXT: fadd v3.4s, v16.4s, v19.4s ; CHECK-NEXT: fadd v2.4s, v2.4s, v6.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v1.16b, #12 ; CHECK-NEXT: ext v5.16b, v2.16b, v3.16b, #12 diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll index a93762918cd87..99f573795489a 100644 --- a/llvm/test/CodeGen/AArch64/extbinopload.ll +++ b/llvm/test/CodeGen/AArch64/extbinopload.ll @@ -645,27 +645,27 @@ define <16 x i32> @double2_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t, define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-LABEL: extrause_load: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ldr s1, [x0] ; CHECK-NEXT: add x8, x3, #8 ; CHECK-NEXT: add x11, x1, #12 -; CHECK-NEXT: str s0, [x4] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ldp s1, s5, [x2] +; CHECK-NEXT: str s1, [x4] ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: umov w9, v1.h[0] -; CHECK-NEXT: umov w10, v1.h[1] -; CHECK-NEXT: mov v2.b[8], w9 -; CHECK-NEXT: umov w9, v1.h[2] -; CHECK-NEXT: mov v2.b[9], w10 -; CHECK-NEXT: umov w10, v1.h[3] -; CHECK-NEXT: ldr s1, [x1] -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: mov v2.b[10], w9 +; CHECK-NEXT: ldp s0, s5, [x2] +; CHECK-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-NEXT: umov w9, v2.h[0] +; CHECK-NEXT: umov w10, v2.h[1] +; CHECK-NEXT: mov v0.b[8], w9 +; CHECK-NEXT: umov w9, v2.h[2] +; CHECK-NEXT: mov v0.b[9], w10 +; CHECK-NEXT: umov w10, v2.h[3] +; CHECK-NEXT: ldr s2, [x1] +; CHECK-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-NEXT: mov v0.b[10], w9 ; CHECK-NEXT: add x9, x1, #4 -; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v2.b[11], w10 +; CHECK-NEXT: uzp1 v1.8b, v1.8b, v2.8b +; CHECK-NEXT: mov v0.b[11], w10 ; CHECK-NEXT: add x10, x3, #12 -; CHECK-NEXT: ld1 { v2.s }[3], [x3], #4 +; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4 ; CHECK-NEXT: ldr s4, [x0, #12] ; CHECK-NEXT: ldp s3, s16, [x0, #4] ; CHECK-NEXT: ldp s6, s7, [x2, #8] @@ -676,19 +676,19 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: ld1 { v7.s }[1], [x10] ; CHECK-NEXT: add x8, x1, #8 ; CHECK-NEXT: ld1 { v16.s }[1], [x8] -; CHECK-NEXT: uaddl v1.8h, v3.8b, v4.8b +; CHECK-NEXT: uaddl v2.8h, v3.8b, v4.8b ; CHECK-NEXT: ushll v3.8h, v6.8b, #0 ; CHECK-NEXT: uaddl v4.8h, v5.8b, v7.8b -; CHECK-NEXT: uaddl v5.8h, v0.8b, v16.8b -; CHECK-NEXT: uaddw2 v2.8h, v3.8h, v2.16b -; CHECK-NEXT: ushll v0.4s, v1.4h, #3 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #3 +; CHECK-NEXT: uaddl v1.8h, v1.8b, v16.8b +; CHECK-NEXT: uaddw2 v5.8h, v3.8h, v0.16b +; CHECK-NEXT: ushll v0.4s, v2.4h, #3 +; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 ; CHECK-NEXT: ushll v6.4s, v4.4h, #3 ; CHECK-NEXT: ushll2 v3.4s, v4.8h, #3 -; CHECK-NEXT: uaddw v0.4s, v0.4s, v5.4h -; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v5.8h -; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v2.8h -; CHECK-NEXT: uaddw v2.4s, v6.4s, v2.4h +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v1.8h +; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v5.8h +; CHECK-NEXT: uaddw v2.4s, v6.4s, v5.4h ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p store <4 x i8> %lp1, ptr %z @@ -757,39 +757,39 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-LABEL: extrause_shuffle: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s2, s3, [x0, #8] +; CHECK-NEXT: ldp s2, s7, [x0, #8] ; CHECK-NEXT: add x8, x3, #8 -; CHECK-NEXT: ldr s16, [x1, #12] +; CHECK-NEXT: ldr s18, [x1, #12] ; CHECK-NEXT: ldp s0, s1, [x2] -; CHECK-NEXT: ldp s6, s7, [x0] +; CHECK-NEXT: ldp s3, s16, [x0] ; CHECK-NEXT: add x9, x1, #8 -; CHECK-NEXT: mov v4.16b, v3.16b -; CHECK-NEXT: ldp s17, s18, [x2, #8] +; CHECK-NEXT: mov v4.16b, v7.16b +; CHECK-NEXT: ldp s6, s17, [x2, #8] ; CHECK-NEXT: ldr s5, [x3, #12] -; CHECK-NEXT: mov v3.s[1], v16.s[0] +; CHECK-NEXT: mov v7.s[1], v18.s[0] ; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 -; CHECK-NEXT: mov v4.s[1], v16.s[0] -; CHECK-NEXT: ld1 { v6.s }[1], [x1], #4 +; CHECK-NEXT: mov v4.s[1], v18.s[0] +; CHECK-NEXT: ld1 { v3.s }[1], [x1], #4 ; CHECK-NEXT: ld1 { v2.s }[1], [x9] -; CHECK-NEXT: ld1 { v17.s }[1], [x8] +; CHECK-NEXT: ld1 { v6.s }[1], [x8] ; CHECK-NEXT: ld1 { v1.s }[1], [x3] -; CHECK-NEXT: ld1 { v7.s }[1], [x1] -; CHECK-NEXT: mov v4.s[2], v18.s[0] -; CHECK-NEXT: mov v18.s[1], v5.s[0] -; CHECK-NEXT: uaddl v2.8h, v6.8b, v2.8b -; CHECK-NEXT: uaddl v6.8h, v0.8b, v17.8b -; CHECK-NEXT: uaddl v3.8h, v7.8b, v3.8b -; CHECK-NEXT: uaddl v1.8h, v1.8b, v18.8b +; CHECK-NEXT: ld1 { v16.s }[1], [x1] +; CHECK-NEXT: mov v4.s[2], v17.s[0] +; CHECK-NEXT: mov v17.s[1], v5.s[0] +; CHECK-NEXT: uaddl v2.8h, v3.8b, v2.8b +; CHECK-NEXT: uaddl v6.8h, v0.8b, v6.8b +; CHECK-NEXT: uaddl v7.8h, v16.8b, v7.8b +; CHECK-NEXT: uaddl v1.8h, v1.8b, v17.8b ; CHECK-NEXT: mov v4.s[3], v5.s[0] -; CHECK-NEXT: ushll v0.4s, v3.4h, #3 -; CHECK-NEXT: ushll v7.4s, v1.4h, #3 -; CHECK-NEXT: ushll2 v16.4s, v1.8h, #3 -; CHECK-NEXT: ushll2 v1.4s, v3.8h, #3 +; CHECK-NEXT: ushll v0.4s, v7.4h, #3 +; CHECK-NEXT: ushll v16.4s, v1.4h, #3 +; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3 +; CHECK-NEXT: ushll2 v1.4s, v7.8h, #3 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h ; CHECK-NEXT: str q4, [x4] ; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v2.8h -; CHECK-NEXT: uaddw2 v3.4s, v16.4s, v6.8h -; CHECK-NEXT: uaddw v2.4s, v7.4s, v6.4h +; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v6.8h +; CHECK-NEXT: uaddw v2.4s, v16.4s, v6.4h ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -858,37 +858,37 @@ define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { define <16 x i32> @extrause_ext(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-LABEL: extrause_ext: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x2] +; CHECK-NEXT: ldp s1, s2, [x2] ; CHECK-NEXT: add x8, x3, #8 -; CHECK-NEXT: ldp s2, s3, [x0] +; CHECK-NEXT: ldp s3, s5, [x0] ; CHECK-NEXT: add x9, x1, #8 ; CHECK-NEXT: add x10, x3, #12 -; CHECK-NEXT: ldp s4, s5, [x2, #8] -; CHECK-NEXT: ldp s6, s7, [x0, #8] +; CHECK-NEXT: ldp s6, s0, [x2, #8] +; CHECK-NEXT: ldp s7, s4, [x0, #8] ; CHECK-NEXT: add x11, x1, #12 -; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 -; CHECK-NEXT: ld1 { v2.s }[1], [x1], #4 -; CHECK-NEXT: ld1 { v5.s }[1], [x10] -; CHECK-NEXT: ld1 { v7.s }[1], [x11] -; CHECK-NEXT: ld1 { v6.s }[1], [x9] -; CHECK-NEXT: ld1 { v4.s }[1], [x8] -; CHECK-NEXT: ld1 { v1.s }[1], [x3] -; CHECK-NEXT: ld1 { v3.s }[1], [x1] -; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b -; CHECK-NEXT: uaddl v4.8h, v0.8b, v4.8b -; CHECK-NEXT: uaddl v1.8h, v1.8b, v5.8b -; CHECK-NEXT: ushll v5.8h, v5.8b, #0 +; CHECK-NEXT: ld1 { v1.s }[1], [x3], #4 +; CHECK-NEXT: ld1 { v3.s }[1], [x1], #4 +; CHECK-NEXT: ld1 { v0.s }[1], [x10] +; CHECK-NEXT: ld1 { v4.s }[1], [x11] +; CHECK-NEXT: ld1 { v7.s }[1], [x9] +; CHECK-NEXT: ld1 { v6.s }[1], [x8] +; CHECK-NEXT: ld1 { v2.s }[1], [x3] +; CHECK-NEXT: ld1 { v5.s }[1], [x1] +; CHECK-NEXT: ushll v16.8h, v0.8b, #0 ; CHECK-NEXT: uaddl v3.8h, v3.8b, v7.8b -; CHECK-NEXT: ushll v6.4s, v1.4h, #3 -; CHECK-NEXT: ushll2 v16.4s, v1.8h, #3 -; CHECK-NEXT: ushll v0.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v1.4s, v3.8h, #3 -; CHECK-NEXT: uaddw2 v3.4s, v16.4s, v4.8h -; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h -; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v2.8h -; CHECK-NEXT: uaddw v2.4s, v6.4s, v4.4h -; CHECK-NEXT: ushll v4.8h, v7.8b, #0 -; CHECK-NEXT: stp q4, q5, [x4] +; CHECK-NEXT: uaddl v6.8h, v1.8b, v6.8b +; CHECK-NEXT: uaddl v2.8h, v2.8b, v0.8b +; CHECK-NEXT: uaddl v5.8h, v5.8b, v4.8b +; CHECK-NEXT: ushll v4.8h, v4.8b, #0 +; CHECK-NEXT: ushll v7.4s, v2.4h, #3 +; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 +; CHECK-NEXT: stp q4, q16, [x4] +; CHECK-NEXT: ushll v1.4s, v5.4h, #3 +; CHECK-NEXT: ushll2 v5.4s, v5.8h, #3 +; CHECK-NEXT: uaddw v0.4s, v1.4s, v3.4h +; CHECK-NEXT: uaddw2 v1.4s, v5.4s, v3.8h +; CHECK-NEXT: uaddw2 v3.4s, v2.4s, v6.8h +; CHECK-NEXT: uaddw v2.4s, v7.4s, v6.4h ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 diff --git a/llvm/test/CodeGen/AArch64/faddp-half.ll b/llvm/test/CodeGen/AArch64/faddp-half.ll index 6a450881dc978..6068a4742eea9 100644 --- a/llvm/test/CodeGen/AArch64/faddp-half.ll +++ b/llvm/test/CodeGen/AArch64/faddp-half.ll @@ -223,15 +223,15 @@ define <16 x half> @addp_v16f16(<16 x half> %a) { ; CHECKNOFP16: // %bb.0: // %entry ; CHECKNOFP16-NEXT: rev32 v5.8h, v0.8h ; CHECKNOFP16-NEXT: rev32 v4.8h, v1.8h -; CHECKNOFP16-NEXT: mov h2, v0.h[1] +; CHECKNOFP16-NEXT: mov h3, v0.h[1] ; CHECKNOFP16-NEXT: mov h6, v1.h[1] ; CHECKNOFP16-NEXT: fcvt s16, h0 ; CHECKNOFP16-NEXT: mov h17, v0.h[2] ; CHECKNOFP16-NEXT: fcvt s20, h1 ; CHECKNOFP16-NEXT: mov h21, v1.h[2] -; CHECKNOFP16-NEXT: mov h3, v5.h[1] +; CHECKNOFP16-NEXT: mov h2, v5.h[1] ; CHECKNOFP16-NEXT: mov h7, v4.h[1] -; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s3, h3 ; CHECKNOFP16-NEXT: fcvt s18, h5 ; CHECKNOFP16-NEXT: mov h19, v5.h[2] ; CHECKNOFP16-NEXT: fcvt s6, h6 @@ -241,7 +241,7 @@ define <16 x half> @addp_v16f16(<16 x half> %a) { ; CHECKNOFP16-NEXT: mov h24, v5.h[3] ; CHECKNOFP16-NEXT: fcvt s21, h21 ; CHECKNOFP16-NEXT: mov h25, v4.h[6] -; CHECKNOFP16-NEXT: fcvt s3, h3 +; CHECKNOFP16-NEXT: fcvt s2, h2 ; CHECKNOFP16-NEXT: fcvt s7, h7 ; CHECKNOFP16-NEXT: fadd s16, s18, s16 ; CHECKNOFP16-NEXT: fcvt s18, h19 @@ -249,7 +249,7 @@ define <16 x half> @addp_v16f16(<16 x half> %a) { ; CHECKNOFP16-NEXT: fadd s20, s22, s20 ; CHECKNOFP16-NEXT: fcvt s22, h23 ; CHECKNOFP16-NEXT: mov h23, v4.h[3] -; CHECKNOFP16-NEXT: fadd s3, s3, s2 +; CHECKNOFP16-NEXT: fadd s3, s2, s3 ; CHECKNOFP16-NEXT: fadd s6, s7, s6 ; CHECKNOFP16-NEXT: mov h7, v1.h[3] ; CHECKNOFP16-NEXT: fcvt h2, s16 diff --git a/llvm/test/CodeGen/AArch64/fcvt_combine.ll b/llvm/test/CodeGen/AArch64/fcvt_combine.ll index bfe8d173435c4..b5b9055fbc02f 100644 --- a/llvm/test/CodeGen/AArch64/fcvt_combine.ll +++ b/llvm/test/CodeGen/AArch64/fcvt_combine.ll @@ -498,7 +498,7 @@ define <8 x i16> @test_v8f16_sat(<8 x half> %in) { ; CHECK-NO16: // %bb.0: ; CHECK-NO16-NEXT: mov h2, v0.h[4] ; CHECK-NO16-NEXT: mov h3, v0.h[5] -; CHECK-NO16-NEXT: mov w9, #32767 // =0x7fff +; CHECK-NO16-NEXT: mov w8, #32767 // =0x7fff ; CHECK-NO16-NEXT: mov h4, v0.h[6] ; CHECK-NO16-NEXT: fmov s1, #4.00000000 ; CHECK-NO16-NEXT: mov w11, #-32768 // =0xffff8000 @@ -512,82 +512,82 @@ define <8 x i16> @test_v8f16_sat(<8 x half> %in) { ; CHECK-NO16-NEXT: fcvt s4, h4 ; CHECK-NO16-NEXT: fcvt s5, h5 ; CHECK-NO16-NEXT: fcvt s6, h6 -; CHECK-NO16-NEXT: fcvt s0, h0 ; CHECK-NO16-NEXT: fmul s2, s2, s1 ; CHECK-NO16-NEXT: fmul s3, s3, s1 ; CHECK-NO16-NEXT: fmul s4, s4, s1 ; CHECK-NO16-NEXT: fmul s5, s5, s1 ; CHECK-NO16-NEXT: fmul s6, s6, s1 -; CHECK-NO16-NEXT: fmul s0, s0, s1 ; CHECK-NO16-NEXT: fcvt h2, s2 ; CHECK-NO16-NEXT: fcvt h3, s3 ; CHECK-NO16-NEXT: fcvt h4, s4 ; CHECK-NO16-NEXT: fcvt h5, s5 ; CHECK-NO16-NEXT: fcvt h6, s6 -; CHECK-NO16-NEXT: fcvt h0, s0 ; CHECK-NO16-NEXT: mov v2.h[1], v3.h[0] ; CHECK-NO16-NEXT: fcvt s3, h7 ; CHECK-NO16-NEXT: fmul s7, s16, s1 ; CHECK-NO16-NEXT: mov v2.h[2], v4.h[0] +; CHECK-NO16-NEXT: fcvt s4, h0 ; CHECK-NO16-NEXT: fmul s3, s3, s1 -; CHECK-NO16-NEXT: fcvt h4, s7 +; CHECK-NO16-NEXT: fcvt h0, s7 ; CHECK-NO16-NEXT: mov v2.h[3], v5.h[0] -; CHECK-NO16-NEXT: fcvt h1, s3 -; CHECK-NO16-NEXT: mov v4.h[1], v6.h[0] +; CHECK-NO16-NEXT: fmul s1, s4, s1 +; CHECK-NO16-NEXT: fcvt h3, s3 +; CHECK-NO16-NEXT: mov v0.h[1], v6.h[0] ; CHECK-NO16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-NO16-NEXT: mov v4.h[2], v1.h[0] -; CHECK-NO16-NEXT: mov s3, v2.s[1] -; CHECK-NO16-NEXT: mov v4.h[3], v0.h[0] -; CHECK-NO16-NEXT: mov s0, v2.s[2] +; CHECK-NO16-NEXT: fcvt h1, s1 +; CHECK-NO16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-NO16-NEXT: mov s4, v2.s[1] ; CHECK-NO16-NEXT: fcvtzs w10, s2 +; CHECK-NO16-NEXT: mov v0.h[3], v1.h[0] +; CHECK-NO16-NEXT: mov s1, v2.s[2] ; CHECK-NO16-NEXT: mov s2, v2.s[3] -; CHECK-NO16-NEXT: fcvtzs w8, s3 -; CHECK-NO16-NEXT: fcvtl v1.4s, v4.4h -; CHECK-NO16-NEXT: fcvtzs w12, s0 +; CHECK-NO16-NEXT: fcvtzs w9, s4 +; CHECK-NO16-NEXT: fcvtl v0.4s, v0.4h +; CHECK-NO16-NEXT: fcvtzs w12, s1 ; CHECK-NO16-NEXT: fcvtzs w13, s2 -; CHECK-NO16-NEXT: cmp w8, w9 -; CHECK-NO16-NEXT: mov s0, v1.s[1] -; CHECK-NO16-NEXT: fcvtzs w15, s1 -; CHECK-NO16-NEXT: csel w8, w8, w9, lt -; CHECK-NO16-NEXT: cmn w8, #8, lsl #12 // =32768 -; CHECK-NO16-NEXT: csel w8, w8, w11, gt -; CHECK-NO16-NEXT: cmp w10, w9 -; CHECK-NO16-NEXT: csel w10, w10, w9, lt -; CHECK-NO16-NEXT: fcvtzs w14, s0 -; CHECK-NO16-NEXT: mov s0, v1.s[2] +; CHECK-NO16-NEXT: cmp w9, w8 +; CHECK-NO16-NEXT: csel w9, w9, w8, lt +; CHECK-NO16-NEXT: mov s1, v0.s[1] +; CHECK-NO16-NEXT: fcvtzs w15, s0 +; CHECK-NO16-NEXT: cmn w9, #8, lsl #12 // =32768 +; CHECK-NO16-NEXT: csel w9, w9, w11, gt +; CHECK-NO16-NEXT: cmp w10, w8 +; CHECK-NO16-NEXT: csel w10, w10, w8, lt ; CHECK-NO16-NEXT: cmn w10, #8, lsl #12 // =32768 +; CHECK-NO16-NEXT: fcvtzs w14, s1 +; CHECK-NO16-NEXT: mov s1, v0.s[2] ; CHECK-NO16-NEXT: csel w10, w10, w11, gt -; CHECK-NO16-NEXT: cmp w12, w9 -; CHECK-NO16-NEXT: csel w12, w12, w9, lt +; CHECK-NO16-NEXT: cmp w12, w8 +; CHECK-NO16-NEXT: mov s0, v0.s[3] +; CHECK-NO16-NEXT: csel w12, w12, w8, lt ; CHECK-NO16-NEXT: cmn w12, #8, lsl #12 // =32768 -; CHECK-NO16-NEXT: fcvtzs w16, s0 -; CHECK-NO16-NEXT: mov s0, v1.s[3] ; CHECK-NO16-NEXT: csel w12, w12, w11, gt -; CHECK-NO16-NEXT: cmp w13, w9 +; CHECK-NO16-NEXT: cmp w13, w8 +; CHECK-NO16-NEXT: fcvtzs w16, s1 +; CHECK-NO16-NEXT: csel w13, w13, w8, lt ; CHECK-NO16-NEXT: fmov s1, w10 -; CHECK-NO16-NEXT: csel w13, w13, w9, lt ; CHECK-NO16-NEXT: cmn w13, #8, lsl #12 // =32768 ; CHECK-NO16-NEXT: csel w13, w13, w11, gt -; CHECK-NO16-NEXT: cmp w14, w9 -; CHECK-NO16-NEXT: mov v1.s[1], w8 -; CHECK-NO16-NEXT: csel w14, w14, w9, lt -; CHECK-NO16-NEXT: fcvtzs w8, s0 +; CHECK-NO16-NEXT: cmp w14, w8 +; CHECK-NO16-NEXT: csel w14, w14, w8, lt +; CHECK-NO16-NEXT: mov v1.s[1], w9 +; CHECK-NO16-NEXT: fcvtzs w9, s0 ; CHECK-NO16-NEXT: cmn w14, #8, lsl #12 // =32768 ; CHECK-NO16-NEXT: csel w14, w14, w11, gt -; CHECK-NO16-NEXT: cmp w15, w9 -; CHECK-NO16-NEXT: csel w15, w15, w9, lt -; CHECK-NO16-NEXT: mov v1.s[2], w12 +; CHECK-NO16-NEXT: cmp w15, w8 +; CHECK-NO16-NEXT: csel w15, w15, w8, lt ; CHECK-NO16-NEXT: cmn w15, #8, lsl #12 // =32768 +; CHECK-NO16-NEXT: mov v1.s[2], w12 ; CHECK-NO16-NEXT: csel w10, w15, w11, gt -; CHECK-NO16-NEXT: cmp w16, w9 +; CHECK-NO16-NEXT: cmp w16, w8 ; CHECK-NO16-NEXT: fmov s2, w10 -; CHECK-NO16-NEXT: csel w10, w16, w9, lt +; CHECK-NO16-NEXT: csel w10, w16, w8, lt ; CHECK-NO16-NEXT: cmn w10, #8, lsl #12 // =32768 -; CHECK-NO16-NEXT: mov v1.s[3], w13 ; CHECK-NO16-NEXT: csel w10, w10, w11, gt -; CHECK-NO16-NEXT: cmp w8, w9 +; CHECK-NO16-NEXT: cmp w9, w8 +; CHECK-NO16-NEXT: mov v1.s[3], w13 ; CHECK-NO16-NEXT: mov v2.s[1], w14 -; CHECK-NO16-NEXT: csel w8, w8, w9, lt +; CHECK-NO16-NEXT: csel w8, w9, w8, lt ; CHECK-NO16-NEXT: cmn w8, #8, lsl #12 // =32768 ; CHECK-NO16-NEXT: csel w8, w8, w11, gt ; CHECK-NO16-NEXT: mov v2.s[2], w10 diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll index 80089288a0365..b7a645bfb546f 100644 --- a/llvm/test/CodeGen/AArch64/fdiv.ll +++ b/llvm/test/CodeGen/AArch64/fdiv.ll @@ -428,10 +428,10 @@ define <16 x half> @fdiv_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-SD-NOFP16-NEXT: mov h6, v2.h[2] ; CHECK-SD-NOFP16-NEXT: fcvt h4, s4 ; CHECK-SD-NOFP16-NEXT: fcvt s6, h6 -; CHECK-SD-NOFP16-NEXT: fdiv s6, s7, s6 -; CHECK-SD-NOFP16-NEXT: mov h7, v2.h[3] -; CHECK-SD-NOFP16-NEXT: fcvt s7, h7 -; CHECK-SD-NOFP16-NEXT: fdiv s7, s16, s7 +; CHECK-SD-NOFP16-NEXT: fdiv s7, s7, s6 +; CHECK-SD-NOFP16-NEXT: mov h6, v2.h[3] +; CHECK-SD-NOFP16-NEXT: fcvt s6, h6 +; CHECK-SD-NOFP16-NEXT: fdiv s6, s16, s6 ; CHECK-SD-NOFP16-NEXT: mov h16, v2.h[4] ; CHECK-SD-NOFP16-NEXT: fcvt s16, h16 ; CHECK-SD-NOFP16-NEXT: fdiv s16, s17, s16 @@ -473,12 +473,12 @@ define <16 x half> @fdiv_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-SD-NOFP16-NEXT: fcvt h2, s20 ; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 ; CHECK-SD-NOFP16-NEXT: mov v0.h[1], v4.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h4, s6 +; CHECK-SD-NOFP16-NEXT: fcvt h4, s7 ; CHECK-SD-NOFP16-NEXT: mov v2.h[1], v5.h[0] ; CHECK-SD-NOFP16-NEXT: fcvt h5, s21 ; CHECK-SD-NOFP16-NEXT: fdiv s20, s25, s26 ; CHECK-SD-NOFP16-NEXT: mov v0.h[2], v4.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h4, s7 +; CHECK-SD-NOFP16-NEXT: fcvt h4, s6 ; CHECK-SD-NOFP16-NEXT: mov v2.h[2], v5.h[0] ; CHECK-SD-NOFP16-NEXT: fcvt h5, s22 ; CHECK-SD-NOFP16-NEXT: mov v0.h[3], v4.h[0] diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll index 9766e22199377..79c99c48ce3dc 100644 --- a/llvm/test/CodeGen/AArch64/fpow.ll +++ b/llvm/test/CodeGen/AArch64/fpow.ll @@ -608,7 +608,7 @@ define <8 x float> @pow_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: mov s15, v1.s[2] ; CHECK-GI-NEXT: mov s13, v1.s[3] ; CHECK-GI-NEXT: // kill: def $s1 killed $s1 killed $q1 -; CHECK-GI-NEXT: str s2, [sp, #64] // 4-byte Folded Spill +; CHECK-GI-NEXT: str s2, [sp, #48] // 4-byte Folded Spill ; CHECK-GI-NEXT: mov s2, v4.s[2] ; CHECK-GI-NEXT: str s2, [sp, #112] // 4-byte Folded Spill ; CHECK-GI-NEXT: mov s2, v3.s[3] @@ -626,17 +626,17 @@ define <8 x float> @pow_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: fmov s0, s10 ; CHECK-GI-NEXT: fmov s1, s13 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-GI-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-GI-NEXT: bl powf ; CHECK-GI-NEXT: fmov s1, s12 ; CHECK-GI-NEXT: str d0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr s0, [sp, #64] // 4-byte Folded Reload +; CHECK-GI-NEXT: ldr s0, [sp, #48] // 4-byte Folded Reload ; CHECK-GI-NEXT: bl powf ; CHECK-GI-NEXT: fmov s1, s11 -; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr s0, [sp, #112] // 4-byte Folded Reload ; CHECK-GI-NEXT: bl powf ; CHECK-GI-NEXT: str d0, [sp, #112] // 16-byte Folded Spill @@ -649,7 +649,7 @@ define <8 x float> @pow_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: ldr x30, [sp, #192] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #144] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[1], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload @@ -657,7 +657,7 @@ define <8 x float> @pow_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: mov v1.s[2], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[2], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[3], v2.s[0] ; CHECK-GI-NEXT: mov v3.s[3], v0.s[0] ; CHECK-GI-NEXT: mov v2.16b, v1.16b @@ -775,15 +775,15 @@ define <7 x half> @pow_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov h2, v0.h[5] -; CHECK-GI-NEXT: mov h8, v0.h[1] -; CHECK-GI-NEXT: mov h9, v0.h[2] -; CHECK-GI-NEXT: mov h10, v0.h[3] -; CHECK-GI-NEXT: mov h11, v0.h[4] +; CHECK-GI-NEXT: mov h9, v0.h[1] +; CHECK-GI-NEXT: mov h10, v0.h[2] +; CHECK-GI-NEXT: mov h11, v0.h[3] +; CHECK-GI-NEXT: mov h12, v0.h[4] ; CHECK-GI-NEXT: mov h14, v1.h[1] ; CHECK-GI-NEXT: mov h15, v1.h[2] -; CHECK-GI-NEXT: mov h13, v1.h[3] -; CHECK-GI-NEXT: mov h12, v1.h[4] -; CHECK-GI-NEXT: str h2, [sp, #64] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h8, v1.h[3] +; CHECK-GI-NEXT: mov h13, v1.h[4] +; CHECK-GI-NEXT: str h2, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h2, v0.h[6] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: str h2, [sp, #80] // 2-byte Folded Spill @@ -793,34 +793,34 @@ define <7 x half> @pow_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str h2, [sp, #174] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h8 +; CHECK-GI-NEXT: fcvt s2, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h14 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h9 +; CHECK-GI-NEXT: fcvt s2, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h10 +; CHECK-GI-NEXT: fcvt s2, h11 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h13 +; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h11 +; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h12 +; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr h0, [sp, #172] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 @@ -833,19 +833,20 @@ define <7 x half> @pow_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #160] // 8-byte Folded Reload -; CHECK-GI-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] @@ -1079,17 +1080,17 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov h2, v0.h[5] -; CHECK-GI-NEXT: mov h10, v0.h[1] -; CHECK-GI-NEXT: mov h11, v0.h[2] -; CHECK-GI-NEXT: mov h12, v0.h[3] -; CHECK-GI-NEXT: mov h13, v0.h[4] +; CHECK-GI-NEXT: mov h11, v0.h[1] +; CHECK-GI-NEXT: mov h12, v0.h[2] +; CHECK-GI-NEXT: mov h13, v0.h[3] +; CHECK-GI-NEXT: mov h14, v0.h[4] ; CHECK-GI-NEXT: mov h8, v1.h[1] ; CHECK-GI-NEXT: mov h9, v1.h[2] -; CHECK-GI-NEXT: mov h15, v1.h[3] -; CHECK-GI-NEXT: mov h14, v1.h[4] +; CHECK-GI-NEXT: mov h10, v1.h[3] +; CHECK-GI-NEXT: mov h15, v1.h[4] ; CHECK-GI-NEXT: str h2, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h2, v0.h[6] -; CHECK-GI-NEXT: str h2, [sp, #80] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h2, [sp, #64] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h2, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: str h2, [sp, #96] // 2-byte Folded Spill @@ -1101,27 +1102,27 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str h2, [sp, #190] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h10 +; CHECK-GI-NEXT: fcvt s2, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h8 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h11 +; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h12 +; CHECK-GI-NEXT: fcvt s2, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h15 +; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h13 +; CHECK-GI-NEXT: fcvt s2, h14 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h14 +; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf @@ -1133,10 +1134,10 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr h0, [sp, #188] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 @@ -1149,7 +1150,7 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: ldp d9, d8, [sp, #160] // 16-byte Folded Reload @@ -1164,7 +1165,7 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] @@ -1367,27 +1368,27 @@ define <16 x half> @pow_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: .cfi_offset b13, -64 ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 -; CHECK-GI-NEXT: mov h4, v0.h[4] +; CHECK-GI-NEXT: mov v4.16b, v1.16b ; CHECK-GI-NEXT: str q1, [sp, #112] // 16-byte Folded Spill -; CHECK-GI-NEXT: mov h11, v0.h[1] -; CHECK-GI-NEXT: mov h12, v0.h[2] -; CHECK-GI-NEXT: mov h13, v0.h[3] +; CHECK-GI-NEXT: mov h1, v0.h[4] +; CHECK-GI-NEXT: mov h12, v0.h[1] +; CHECK-GI-NEXT: mov h13, v0.h[2] ; CHECK-GI-NEXT: str q3, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov h14, v0.h[3] ; CHECK-GI-NEXT: mov h15, v2.h[1] ; CHECK-GI-NEXT: mov h8, v2.h[2] ; CHECK-GI-NEXT: mov h9, v2.h[3] ; CHECK-GI-NEXT: mov h10, v2.h[4] -; CHECK-GI-NEXT: mov h14, v2.h[5] -; CHECK-GI-NEXT: str h4, [sp, #288] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h4, v0.h[5] -; CHECK-GI-NEXT: str h4, [sp, #240] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h4, v0.h[6] -; CHECK-GI-NEXT: str h4, [sp, #176] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h4, v0.h[7] +; CHECK-GI-NEXT: mov h11, v2.h[5] +; CHECK-GI-NEXT: str h1, [sp, #272] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v0.h[5] +; CHECK-GI-NEXT: str h1, [sp, #240] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v0.h[6] +; CHECK-GI-NEXT: str h1, [sp, #176] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: str h4, [sp, #144] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov v4.16b, v1.16b -; CHECK-GI-NEXT: mov h1, v1.h[1] +; CHECK-GI-NEXT: str h1, [sp, #144] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v4.h[1] ; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[2] ; CHECK-GI-NEXT: str h1, [sp, #80] // 2-byte Folded Spill @@ -1398,7 +1399,7 @@ define <16 x half> @pow_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: mov h1, v4.h[5] ; CHECK-GI-NEXT: str h1, [sp, #256] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[6] -; CHECK-GI-NEXT: str h1, [sp, #320] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #336] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[7] ; CHECK-GI-NEXT: str h1, [sp, #352] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[6] @@ -1416,40 +1417,40 @@ define <16 x half> @pow_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: mov h1, v3.h[5] ; CHECK-GI-NEXT: str h1, [sp, #174] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[6] -; CHECK-GI-NEXT: str h1, [sp, #222] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #238] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[7] -; CHECK-GI-NEXT: str h1, [sp, #286] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #302] // 2-byte Folded Spill ; CHECK-GI-NEXT: fcvt s1, h2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h11 +; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp, #304] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h12 +; CHECK-GI-NEXT: fcvt s2, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h8 -; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h13 +; CHECK-GI-NEXT: fcvt s2, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h9 -; CHECK-GI-NEXT: str q0, [sp, #336] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #320] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #288] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #272] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 ; CHECK-GI-NEXT: fcvt s1, h10 -; CHECK-GI-NEXT: str q0, [sp, #288] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #272] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf ; CHECK-GI-NEXT: ldr h1, [sp, #240] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: fcvt s1, h14 +; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: str q0, [sp, #240] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf @@ -1517,11 +1518,11 @@ define <16 x half> @pow_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #320] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #336] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #320] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #222] // 2-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp, #336] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr h0, [sp, #238] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf @@ -1529,47 +1530,46 @@ define <16 x half> @pow_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 ; CHECK-GI-NEXT: str q0, [sp, #352] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #286] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h0, [sp, #302] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr q1, [sp, #304] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #224] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q3, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q3, [sp, #304] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #208] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #432] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #416] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #336] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #320] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #400] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #384] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #288] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #272] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d15, d14, [sp, #368] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #192] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #192] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #240] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #256] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #256] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #320] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #336] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 ; CHECK-GI-NEXT: ldr q0, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #352] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] +; CHECK-GI-NEXT: ldr q0, [sp, #352] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] ; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] -; CHECK-GI-NEXT: mov v3.h[7], v2.h[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b -; CHECK-GI-NEXT: mov v1.16b, v3.16b +; CHECK-GI-NEXT: mov v3.h[7], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[7], v2.h[0] +; CHECK-GI-NEXT: mov v0.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #448 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll index a36a58660cd40..92fd3183393ea 100644 --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -2377,114 +2377,114 @@ define <8 x i50> @test_signed_v8f16_v8i50(<8 x half> %f) { ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-CVT-NEXT: mov x8, #562949953421311 // =0x1ffffffffffff -; CHECK-CVT-NEXT: mov x11, #-562949953421312 // =0xfffe000000000000 +; CHECK-CVT-NEXT: mov x9, #-562949953421312 // =0xfffe000000000000 ; CHECK-CVT-NEXT: mov h2, v1.h[1] ; CHECK-CVT-NEXT: fcvt s3, h1 ; CHECK-CVT-NEXT: mov h4, v1.h[2] ; CHECK-CVT-NEXT: mov h1, v1.h[3] ; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fcvtzs x9, s3 +; CHECK-CVT-NEXT: fcvtzs x10, s3 ; CHECK-CVT-NEXT: fcvt s3, h4 ; CHECK-CVT-NEXT: fcvt s1, h1 -; CHECK-CVT-NEXT: fcvtzs x10, s2 -; CHECK-CVT-NEXT: cmp x9, x8 +; CHECK-CVT-NEXT: fcvtzs x11, s2 +; CHECK-CVT-NEXT: cmp x10, x8 ; CHECK-CVT-NEXT: fcvtzs x12, s3 -; CHECK-CVT-NEXT: csel x9, x9, x8, lt +; CHECK-CVT-NEXT: csel x10, x10, x8, lt ; CHECK-CVT-NEXT: mov h2, v0.h[1] ; CHECK-CVT-NEXT: fcvt s3, h0 -; CHECK-CVT-NEXT: cmp x9, x11 -; CHECK-CVT-NEXT: csel x4, x9, x11, gt -; CHECK-CVT-NEXT: cmp x10, x8 -; CHECK-CVT-NEXT: csel x9, x10, x8, lt -; CHECK-CVT-NEXT: fcvtzs x10, s1 +; CHECK-CVT-NEXT: cmp x10, x9 +; CHECK-CVT-NEXT: csel x4, x10, x9, gt +; CHECK-CVT-NEXT: cmp x11, x8 +; CHECK-CVT-NEXT: csel x10, x11, x8, lt +; CHECK-CVT-NEXT: fcvtzs x11, s1 ; CHECK-CVT-NEXT: mov h1, v0.h[2] -; CHECK-CVT-NEXT: cmp x9, x11 +; CHECK-CVT-NEXT: cmp x10, x9 ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: mov h0, v0.h[3] -; CHECK-CVT-NEXT: csel x5, x9, x11, gt +; CHECK-CVT-NEXT: csel x5, x10, x9, gt ; CHECK-CVT-NEXT: cmp x12, x8 -; CHECK-CVT-NEXT: csel x9, x12, x8, lt +; CHECK-CVT-NEXT: csel x10, x12, x8, lt ; CHECK-CVT-NEXT: fcvtzs x12, s3 -; CHECK-CVT-NEXT: cmp x9, x11 +; CHECK-CVT-NEXT: cmp x10, x9 ; CHECK-CVT-NEXT: fcvt s1, h1 -; CHECK-CVT-NEXT: csel x6, x9, x11, gt -; CHECK-CVT-NEXT: cmp x10, x8 +; CHECK-CVT-NEXT: csel x6, x10, x9, gt +; CHECK-CVT-NEXT: cmp x11, x8 ; CHECK-CVT-NEXT: fcvt s0, h0 -; CHECK-CVT-NEXT: csel x9, x10, x8, lt -; CHECK-CVT-NEXT: fcvtzs x10, s2 -; CHECK-CVT-NEXT: cmp x9, x11 -; CHECK-CVT-NEXT: csel x7, x9, x11, gt +; CHECK-CVT-NEXT: csel x10, x11, x8, lt +; CHECK-CVT-NEXT: fcvtzs x11, s2 +; CHECK-CVT-NEXT: cmp x10, x9 +; CHECK-CVT-NEXT: csel x7, x10, x9, gt ; CHECK-CVT-NEXT: cmp x12, x8 -; CHECK-CVT-NEXT: csel x9, x12, x8, lt +; CHECK-CVT-NEXT: csel x10, x12, x8, lt ; CHECK-CVT-NEXT: fcvtzs x12, s1 -; CHECK-CVT-NEXT: cmp x9, x11 -; CHECK-CVT-NEXT: csel x0, x9, x11, gt -; CHECK-CVT-NEXT: cmp x10, x8 -; CHECK-CVT-NEXT: csel x9, x10, x8, lt -; CHECK-CVT-NEXT: fcvtzs x10, s0 -; CHECK-CVT-NEXT: cmp x9, x11 -; CHECK-CVT-NEXT: csel x1, x9, x11, gt +; CHECK-CVT-NEXT: cmp x10, x9 +; CHECK-CVT-NEXT: csel x0, x10, x9, gt +; CHECK-CVT-NEXT: cmp x11, x8 +; CHECK-CVT-NEXT: csel x10, x11, x8, lt +; CHECK-CVT-NEXT: fcvtzs x11, s0 +; CHECK-CVT-NEXT: cmp x10, x9 +; CHECK-CVT-NEXT: csel x1, x10, x9, gt ; CHECK-CVT-NEXT: cmp x12, x8 -; CHECK-CVT-NEXT: csel x9, x12, x8, lt -; CHECK-CVT-NEXT: cmp x9, x11 -; CHECK-CVT-NEXT: csel x2, x9, x11, gt -; CHECK-CVT-NEXT: cmp x10, x8 -; CHECK-CVT-NEXT: csel x8, x10, x8, lt -; CHECK-CVT-NEXT: cmp x8, x11 -; CHECK-CVT-NEXT: csel x3, x8, x11, gt +; CHECK-CVT-NEXT: csel x10, x12, x8, lt +; CHECK-CVT-NEXT: cmp x10, x9 +; CHECK-CVT-NEXT: csel x2, x10, x9, gt +; CHECK-CVT-NEXT: cmp x11, x8 +; CHECK-CVT-NEXT: csel x8, x11, x8, lt +; CHECK-CVT-NEXT: cmp x8, x9 +; CHECK-CVT-NEXT: csel x3, x8, x9, gt ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: test_signed_v8f16_v8i50: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-FP16-NEXT: mov x8, #562949953421311 // =0x1ffffffffffff -; CHECK-FP16-NEXT: mov x11, #-562949953421312 // =0xfffe000000000000 +; CHECK-FP16-NEXT: mov x9, #-562949953421312 // =0xfffe000000000000 ; CHECK-FP16-NEXT: mov h2, v1.h[1] -; CHECK-FP16-NEXT: fcvtzs x9, h1 +; CHECK-FP16-NEXT: fcvtzs x10, h1 ; CHECK-FP16-NEXT: mov h3, v1.h[2] ; CHECK-FP16-NEXT: mov h1, v1.h[3] -; CHECK-FP16-NEXT: fcvtzs x10, h2 -; CHECK-FP16-NEXT: cmp x9, x8 +; CHECK-FP16-NEXT: fcvtzs x11, h2 +; CHECK-FP16-NEXT: cmp x10, x8 ; CHECK-FP16-NEXT: fcvtzs x12, h3 -; CHECK-FP16-NEXT: csel x9, x9, x8, lt +; CHECK-FP16-NEXT: csel x10, x10, x8, lt ; CHECK-FP16-NEXT: mov h2, v0.h[2] -; CHECK-FP16-NEXT: cmp x9, x11 -; CHECK-FP16-NEXT: csel x4, x9, x11, gt -; CHECK-FP16-NEXT: cmp x10, x8 -; CHECK-FP16-NEXT: csel x9, x10, x8, lt -; CHECK-FP16-NEXT: fcvtzs x10, h1 +; CHECK-FP16-NEXT: cmp x10, x9 +; CHECK-FP16-NEXT: csel x4, x10, x9, gt +; CHECK-FP16-NEXT: cmp x11, x8 +; CHECK-FP16-NEXT: csel x10, x11, x8, lt +; CHECK-FP16-NEXT: fcvtzs x11, h1 ; CHECK-FP16-NEXT: mov h1, v0.h[1] -; CHECK-FP16-NEXT: cmp x9, x11 -; CHECK-FP16-NEXT: csel x5, x9, x11, gt +; CHECK-FP16-NEXT: cmp x10, x9 +; CHECK-FP16-NEXT: csel x5, x10, x9, gt ; CHECK-FP16-NEXT: cmp x12, x8 -; CHECK-FP16-NEXT: csel x9, x12, x8, lt +; CHECK-FP16-NEXT: csel x10, x12, x8, lt ; CHECK-FP16-NEXT: fcvtzs x12, h0 ; CHECK-FP16-NEXT: mov h0, v0.h[3] -; CHECK-FP16-NEXT: cmp x9, x11 -; CHECK-FP16-NEXT: csel x6, x9, x11, gt -; CHECK-FP16-NEXT: cmp x10, x8 -; CHECK-FP16-NEXT: csel x9, x10, x8, lt -; CHECK-FP16-NEXT: fcvtzs x10, h1 -; CHECK-FP16-NEXT: cmp x9, x11 -; CHECK-FP16-NEXT: csel x7, x9, x11, gt +; CHECK-FP16-NEXT: cmp x10, x9 +; CHECK-FP16-NEXT: csel x6, x10, x9, gt +; CHECK-FP16-NEXT: cmp x11, x8 +; CHECK-FP16-NEXT: csel x10, x11, x8, lt +; CHECK-FP16-NEXT: fcvtzs x11, h1 +; CHECK-FP16-NEXT: cmp x10, x9 +; CHECK-FP16-NEXT: csel x7, x10, x9, gt ; CHECK-FP16-NEXT: cmp x12, x8 -; CHECK-FP16-NEXT: csel x9, x12, x8, lt +; CHECK-FP16-NEXT: csel x10, x12, x8, lt ; CHECK-FP16-NEXT: fcvtzs x12, h2 -; CHECK-FP16-NEXT: cmp x9, x11 -; CHECK-FP16-NEXT: csel x0, x9, x11, gt -; CHECK-FP16-NEXT: cmp x10, x8 -; CHECK-FP16-NEXT: csel x9, x10, x8, lt -; CHECK-FP16-NEXT: fcvtzs x10, h0 -; CHECK-FP16-NEXT: cmp x9, x11 -; CHECK-FP16-NEXT: csel x1, x9, x11, gt +; CHECK-FP16-NEXT: cmp x10, x9 +; CHECK-FP16-NEXT: csel x0, x10, x9, gt +; CHECK-FP16-NEXT: cmp x11, x8 +; CHECK-FP16-NEXT: csel x10, x11, x8, lt +; CHECK-FP16-NEXT: fcvtzs x11, h0 +; CHECK-FP16-NEXT: cmp x10, x9 +; CHECK-FP16-NEXT: csel x1, x10, x9, gt ; CHECK-FP16-NEXT: cmp x12, x8 -; CHECK-FP16-NEXT: csel x9, x12, x8, lt -; CHECK-FP16-NEXT: cmp x9, x11 -; CHECK-FP16-NEXT: csel x2, x9, x11, gt -; CHECK-FP16-NEXT: cmp x10, x8 -; CHECK-FP16-NEXT: csel x8, x10, x8, lt -; CHECK-FP16-NEXT: cmp x8, x11 -; CHECK-FP16-NEXT: csel x3, x8, x11, gt +; CHECK-FP16-NEXT: csel x10, x12, x8, lt +; CHECK-FP16-NEXT: cmp x10, x9 +; CHECK-FP16-NEXT: csel x2, x10, x9, gt +; CHECK-FP16-NEXT: cmp x11, x8 +; CHECK-FP16-NEXT: csel x8, x11, x8, lt +; CHECK-FP16-NEXT: cmp x8, x9 +; CHECK-FP16-NEXT: csel x3, x8, x9, gt ; CHECK-FP16-NEXT: ret %x = call <8 x i50> @llvm.fptosi.sat.v8f16.v8i50(<8 x half> %f) ret <8 x i50> %x @@ -2596,11 +2596,11 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: mov w8, #1895825407 // =0x70ffffff ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: fmov s9, w8 -; CHECK-NEXT: mov x21, #-34359738368 // =0xfffffff800000000 +; CHECK-NEXT: mov x22, #-34359738368 // =0xfffffff800000000 ; CHECK-NEXT: mov x23, #34359738367 // =0x7ffffffff ; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csel x8, x21, x1, lt +; CHECK-NEXT: csel x8, x22, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le @@ -2616,7 +2616,7 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x21, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le @@ -2630,7 +2630,7 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[2] -; CHECK-NEXT: csel x8, x21, x1, lt +; CHECK-NEXT: csel x8, x22, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le @@ -2645,14 +2645,14 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[1] -; CHECK-NEXT: csel x8, x21, x1, lt +; CHECK-NEXT: csel x8, x22, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le ; CHECK-NEXT: csel x8, x23, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x28, xzr, x8, vs +; CHECK-NEXT: csel x27, xzr, x8, vs ; CHECK-NEXT: csel x8, xzr, x9, vs ; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 @@ -2660,60 +2660,60 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[3] -; CHECK-NEXT: csel x8, x21, x1, lt +; CHECK-NEXT: csel x8, x22, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le ; CHECK-NEXT: csel x8, x23, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x27, xzr, x8, vs -; CHECK-NEXT: csel x20, xzr, x9, vs +; CHECK-NEXT: csel x20, xzr, x8, vs +; CHECK-NEXT: csel x21, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x21, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x22, xzr, x8, vs -; CHECK-NEXT: csel x29, xzr, x9, vs +; CHECK-NEXT: csel x28, xzr, x8, vs +; CHECK-NEXT: csel x24, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[2] -; CHECK-NEXT: csel x8, x21, x1, lt +; CHECK-NEXT: csel x8, x22, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le ; CHECK-NEXT: csel x8, x23, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x24, xzr, x8, vs -; CHECK-NEXT: csel x25, xzr, x9, vs +; CHECK-NEXT: csel x25, xzr, x8, vs +; CHECK-NEXT: csel x29, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: ldr x9, [sp] // 8-byte Folded Reload -; CHECK-NEXT: extr x8, x29, x22, #28 +; CHECK-NEXT: extr x8, x24, x28, #28 ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: bfi x24, x20, #36, #28 -; CHECK-NEXT: lsr x11, x27, #28 +; CHECK-NEXT: bfi x25, x21, #36, #28 +; CHECK-NEXT: lsr x11, x20, #28 ; CHECK-NEXT: stur x9, [x19, #75] -; CHECK-NEXT: extr x9, x27, x20, #28 +; CHECK-NEXT: extr x9, x20, x21, #28 ; CHECK-NEXT: stur x8, [x19, #41] -; CHECK-NEXT: csel x8, x21, x1, lt +; CHECK-NEXT: csel x8, x22, x1, lt ; CHECK-NEXT: str x9, [x19, #16] ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: stp x25, x24, [x19] +; CHECK-NEXT: stp x29, x25, [x19] ; CHECK-NEXT: stur x10, [x19, #50] -; CHECK-NEXT: lsr x10, x29, #28 +; CHECK-NEXT: lsr x10, x24, #28 ; CHECK-NEXT: csinv x9, x9, xzr, le ; CHECK-NEXT: csel x8, x23, x8, gt ; CHECK-NEXT: fcmp s8, s8 @@ -2723,9 +2723,9 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x8, xzr, x8, vs ; CHECK-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload ; CHECK-NEXT: csel x9, xzr, x9, vs -; CHECK-NEXT: bfi x8, x22, #36, #28 +; CHECK-NEXT: bfi x8, x28, #36, #28 ; CHECK-NEXT: extr x10, x14, x12, #28 -; CHECK-NEXT: bfi x28, x12, #36, #28 +; CHECK-NEXT: bfi x27, x12, #36, #28 ; CHECK-NEXT: ldr x12, [sp, #72] // 8-byte Folded Reload ; CHECK-NEXT: bfi x26, x13, #36, #28 ; CHECK-NEXT: stur x9, [x19, #25] @@ -2734,7 +2734,7 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: stur x8, [x19, #33] ; CHECK-NEXT: lsr x8, x12, #28 ; CHECK-NEXT: stur x10, [x19, #91] -; CHECK-NEXT: stur x28, [x19, #83] +; CHECK-NEXT: stur x27, [x19, #83] ; CHECK-NEXT: stur x11, [x19, #66] ; CHECK-NEXT: stur x26, [x19, #58] ; CHECK-NEXT: strb w9, [x19, #99] @@ -2792,14 +2792,14 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { ; CHECK-NEXT: mov w8, #2130706431 // =0x7effffff ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: fmov s10, w8 -; CHECK-NEXT: mov x23, #-9223372036854775808 // =0x8000000000000000 -; CHECK-NEXT: mov x22, #9223372036854775807 // =0x7fffffffffffffff +; CHECK-NEXT: mov x22, #-9223372036854775808 // =0x8000000000000000 +; CHECK-NEXT: mov x23, #9223372036854775807 // =0x7fffffffffffffff ; CHECK-NEXT: mov h0, v0.h[1] ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x23, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 @@ -2813,9 +2813,9 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[2] ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x23, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 @@ -2828,9 +2828,9 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x23, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 @@ -2843,9 +2843,9 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x23, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 @@ -2857,9 +2857,9 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[1] ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x23, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 @@ -2871,9 +2871,9 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[2] ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x23, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 @@ -2885,9 +2885,9 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x23, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 @@ -2900,10 +2900,10 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { ; CHECK-NEXT: stp x24, x25, [x19, #16] ; CHECK-NEXT: stp x20, x21, [x19] ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x23, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: stp x28, x29, [x19, #112] -; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: csel x9, xzr, x9, vs @@ -3030,6 +3030,7 @@ define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) { ; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h ; CHECK-CVT-NEXT: mov s3, v2.s[1] ; CHECK-CVT-NEXT: fcvtzs w10, s2 +; CHECK-CVT-NEXT: fcvtzs w16, s1 ; CHECK-CVT-NEXT: fcvtzs w9, s3 ; CHECK-CVT-NEXT: mov s3, v2.s[2] ; CHECK-CVT-NEXT: mov s2, v2.s[3] @@ -3041,6 +3042,7 @@ define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) { ; CHECK-CVT-NEXT: fcvtzs w14, s2 ; CHECK-CVT-NEXT: cmn w11, #128 ; CHECK-CVT-NEXT: mov s2, v1.s[2] +; CHECK-CVT-NEXT: mov s1, v1.s[3] ; CHECK-CVT-NEXT: csel w11, w11, w9, gt ; CHECK-CVT-NEXT: cmp w10, #127 ; CHECK-CVT-NEXT: csel w10, w10, w8, lt @@ -3050,56 +3052,53 @@ define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) { ; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h ; CHECK-CVT-NEXT: csel w13, w10, w9, gt ; CHECK-CVT-NEXT: cmp w12, #127 -; CHECK-CVT-NEXT: fcvtzs w16, s2 +; CHECK-CVT-NEXT: fcvtzs w17, s1 ; CHECK-CVT-NEXT: csel w10, w12, w8, lt ; CHECK-CVT-NEXT: cmn w10, #128 -; CHECK-CVT-NEXT: mov s2, v3.s[1] +; CHECK-CVT-NEXT: mov s1, v3.s[2] ; CHECK-CVT-NEXT: fcvtzs w0, s3 ; CHECK-CVT-NEXT: csel w10, w10, w9, gt ; CHECK-CVT-NEXT: cmp w14, #127 ; CHECK-CVT-NEXT: fcvtzs w4, s0 ; CHECK-CVT-NEXT: csel w12, w14, w8, lt -; CHECK-CVT-NEXT: fcvtzs w14, s1 -; CHECK-CVT-NEXT: mov s1, v1.s[3] ; CHECK-CVT-NEXT: cmn w12, #128 ; CHECK-CVT-NEXT: csel w12, w12, w9, gt ; CHECK-CVT-NEXT: cmp w15, #127 -; CHECK-CVT-NEXT: fcvtzs w18, s2 -; CHECK-CVT-NEXT: csel w15, w15, w8, lt -; CHECK-CVT-NEXT: mov s2, v3.s[3] -; CHECK-CVT-NEXT: cmn w15, #128 -; CHECK-CVT-NEXT: fcvtzs w17, s1 -; CHECK-CVT-NEXT: mov s1, v3.s[2] -; CHECK-CVT-NEXT: csel w15, w15, w9, gt -; CHECK-CVT-NEXT: cmp w14, #127 -; CHECK-CVT-NEXT: csel w14, w14, w8, lt +; CHECK-CVT-NEXT: fcvtzs w1, s1 +; CHECK-CVT-NEXT: csel w14, w15, w8, lt +; CHECK-CVT-NEXT: fcvtzs w15, s2 +; CHECK-CVT-NEXT: mov s2, v3.s[1] ; CHECK-CVT-NEXT: cmn w14, #128 -; CHECK-CVT-NEXT: fcvtzs w2, s2 -; CHECK-CVT-NEXT: fmov s2, w13 +; CHECK-CVT-NEXT: mov s1, v0.s[1] ; CHECK-CVT-NEXT: csel w14, w14, w9, gt ; CHECK-CVT-NEXT: cmp w16, #127 -; CHECK-CVT-NEXT: fcvtzs w1, s1 ; CHECK-CVT-NEXT: csel w16, w16, w8, lt -; CHECK-CVT-NEXT: mov s1, v0.s[1] ; CHECK-CVT-NEXT: cmn w16, #128 -; CHECK-CVT-NEXT: mov v2.s[1], w11 +; CHECK-CVT-NEXT: fcvtzs w18, s2 +; CHECK-CVT-NEXT: mov s2, v3.s[3] ; CHECK-CVT-NEXT: csel w16, w16, w9, gt +; CHECK-CVT-NEXT: cmp w15, #127 +; CHECK-CVT-NEXT: fcvtzs w3, s1 +; CHECK-CVT-NEXT: csel w15, w15, w8, lt +; CHECK-CVT-NEXT: mov s1, v0.s[2] +; CHECK-CVT-NEXT: mov s0, v0.s[3] +; CHECK-CVT-NEXT: cmn w15, #128 +; CHECK-CVT-NEXT: csel w15, w15, w9, gt ; CHECK-CVT-NEXT: cmp w17, #127 +; CHECK-CVT-NEXT: fcvtzs w2, s2 ; CHECK-CVT-NEXT: csel w17, w17, w8, lt +; CHECK-CVT-NEXT: fmov s2, w13 ; CHECK-CVT-NEXT: cmn w17, #128 -; CHECK-CVT-NEXT: fcvtzs w3, s1 -; CHECK-CVT-NEXT: mov s1, v0.s[2] ; CHECK-CVT-NEXT: csel w17, w17, w9, gt ; CHECK-CVT-NEXT: cmp w18, #127 -; CHECK-CVT-NEXT: mov s0, v0.s[3] ; CHECK-CVT-NEXT: csel w18, w18, w8, lt -; CHECK-CVT-NEXT: mov v2.s[2], w10 +; CHECK-CVT-NEXT: mov v2.s[1], w11 ; CHECK-CVT-NEXT: cmn w18, #128 ; CHECK-CVT-NEXT: csel w18, w18, w9, gt ; CHECK-CVT-NEXT: cmp w0, #127 ; CHECK-CVT-NEXT: csel w0, w0, w8, lt ; CHECK-CVT-NEXT: cmn w0, #128 -; CHECK-CVT-NEXT: mov v2.s[3], w12 +; CHECK-CVT-NEXT: mov v2.s[2], w10 ; CHECK-CVT-NEXT: csel w0, w0, w9, gt ; CHECK-CVT-NEXT: cmp w1, #127 ; CHECK-CVT-NEXT: csel w1, w1, w8, lt @@ -3107,6 +3106,7 @@ define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) { ; CHECK-CVT-NEXT: cmn w1, #128 ; CHECK-CVT-NEXT: csel w1, w1, w9, gt ; CHECK-CVT-NEXT: cmp w2, #127 +; CHECK-CVT-NEXT: mov v2.s[3], w12 ; CHECK-CVT-NEXT: csel w2, w2, w8, lt ; CHECK-CVT-NEXT: mov v3.s[1], w18 ; CHECK-CVT-NEXT: cmn w2, #128 @@ -3119,18 +3119,18 @@ define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) { ; CHECK-CVT-NEXT: cmp w4, #127 ; CHECK-CVT-NEXT: csel w3, w4, w8, lt ; CHECK-CVT-NEXT: fcvtzs w4, s1 -; CHECK-CVT-NEXT: fmov s1, w14 +; CHECK-CVT-NEXT: fmov s1, w16 ; CHECK-CVT-NEXT: cmn w3, #128 ; CHECK-CVT-NEXT: csel w11, w3, w9, gt ; CHECK-CVT-NEXT: mov v3.s[3], w2 ; CHECK-CVT-NEXT: fmov s4, w11 -; CHECK-CVT-NEXT: mov v1.s[1], w15 +; CHECK-CVT-NEXT: mov v1.s[1], w14 ; CHECK-CVT-NEXT: fcvtzs w11, s0 ; CHECK-CVT-NEXT: cmp w4, #127 ; CHECK-CVT-NEXT: mov v4.s[1], w13 ; CHECK-CVT-NEXT: csel w13, w4, w8, lt ; CHECK-CVT-NEXT: cmn w13, #128 -; CHECK-CVT-NEXT: mov v1.s[2], w16 +; CHECK-CVT-NEXT: mov v1.s[2], w15 ; CHECK-CVT-NEXT: csel w10, w13, w9, gt ; CHECK-CVT-NEXT: cmp w11, #127 ; CHECK-CVT-NEXT: csel w8, w11, w8, lt @@ -3163,6 +3163,7 @@ define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) { ; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h ; CHECK-CVT-NEXT: mov s3, v2.s[1] ; CHECK-CVT-NEXT: fcvtzs w10, s2 +; CHECK-CVT-NEXT: fcvtzs w16, s0 ; CHECK-CVT-NEXT: fcvtzs w9, s3 ; CHECK-CVT-NEXT: mov s3, v2.s[2] ; CHECK-CVT-NEXT: mov s2, v2.s[3] @@ -3174,6 +3175,7 @@ define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) { ; CHECK-CVT-NEXT: fcvtzs w14, s2 ; CHECK-CVT-NEXT: cmn w11, #8, lsl #12 // =32768 ; CHECK-CVT-NEXT: mov s2, v0.s[2] +; CHECK-CVT-NEXT: mov s0, v0.s[3] ; CHECK-CVT-NEXT: csel w11, w11, w9, gt ; CHECK-CVT-NEXT: cmp w10, w8 ; CHECK-CVT-NEXT: csel w10, w10, w8, lt @@ -3183,55 +3185,52 @@ define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) { ; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h ; CHECK-CVT-NEXT: csel w13, w10, w9, gt ; CHECK-CVT-NEXT: cmp w12, w8 -; CHECK-CVT-NEXT: fcvtzs w16, s2 +; CHECK-CVT-NEXT: fcvtzs w17, s0 ; CHECK-CVT-NEXT: csel w10, w12, w8, lt ; CHECK-CVT-NEXT: cmn w10, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: mov s2, v3.s[1] +; CHECK-CVT-NEXT: mov s0, v3.s[2] ; CHECK-CVT-NEXT: fcvtzs w0, s3 ; CHECK-CVT-NEXT: csel w10, w10, w9, gt ; CHECK-CVT-NEXT: cmp w14, w8 ; CHECK-CVT-NEXT: fcvtzs w4, s1 ; CHECK-CVT-NEXT: csel w12, w14, w8, lt -; CHECK-CVT-NEXT: fcvtzs w14, s0 -; CHECK-CVT-NEXT: mov s0, v0.s[3] ; CHECK-CVT-NEXT: cmn w12, #8, lsl #12 // =32768 ; CHECK-CVT-NEXT: csel w12, w12, w9, gt ; CHECK-CVT-NEXT: cmp w15, w8 -; CHECK-CVT-NEXT: fcvtzs w18, s2 -; CHECK-CVT-NEXT: csel w15, w15, w8, lt -; CHECK-CVT-NEXT: mov s2, v3.s[3] -; CHECK-CVT-NEXT: cmn w15, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: fcvtzs w17, s0 -; CHECK-CVT-NEXT: mov s0, v3.s[2] -; CHECK-CVT-NEXT: csel w15, w15, w9, gt -; CHECK-CVT-NEXT: cmp w14, w8 -; CHECK-CVT-NEXT: csel w14, w14, w8, lt +; CHECK-CVT-NEXT: fcvtzs w1, s0 +; CHECK-CVT-NEXT: csel w14, w15, w8, lt +; CHECK-CVT-NEXT: fcvtzs w15, s2 +; CHECK-CVT-NEXT: mov s2, v3.s[1] ; CHECK-CVT-NEXT: cmn w14, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: fcvtzs w2, s2 -; CHECK-CVT-NEXT: fmov s2, w13 +; CHECK-CVT-NEXT: mov s0, v1.s[1] ; CHECK-CVT-NEXT: csel w14, w14, w9, gt ; CHECK-CVT-NEXT: cmp w16, w8 -; CHECK-CVT-NEXT: fcvtzs w1, s0 ; CHECK-CVT-NEXT: csel w16, w16, w8, lt -; CHECK-CVT-NEXT: mov s0, v1.s[1] ; CHECK-CVT-NEXT: cmn w16, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: mov v2.s[1], w11 +; CHECK-CVT-NEXT: fcvtzs w18, s2 +; CHECK-CVT-NEXT: mov s2, v3.s[3] ; CHECK-CVT-NEXT: csel w16, w16, w9, gt +; CHECK-CVT-NEXT: cmp w15, w8 +; CHECK-CVT-NEXT: fcvtzs w3, s0 +; CHECK-CVT-NEXT: csel w15, w15, w8, lt +; CHECK-CVT-NEXT: mov s0, v1.s[2] +; CHECK-CVT-NEXT: cmn w15, #8, lsl #12 // =32768 +; CHECK-CVT-NEXT: csel w15, w15, w9, gt ; CHECK-CVT-NEXT: cmp w17, w8 +; CHECK-CVT-NEXT: fcvtzs w2, s2 ; CHECK-CVT-NEXT: csel w17, w17, w8, lt +; CHECK-CVT-NEXT: fmov s2, w13 ; CHECK-CVT-NEXT: cmn w17, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: fcvtzs w3, s0 -; CHECK-CVT-NEXT: mov s0, v1.s[2] ; CHECK-CVT-NEXT: csel w17, w17, w9, gt ; CHECK-CVT-NEXT: cmp w18, w8 -; CHECK-CVT-NEXT: mov v2.s[2], w10 ; CHECK-CVT-NEXT: csel w18, w18, w8, lt +; CHECK-CVT-NEXT: mov v2.s[1], w11 ; CHECK-CVT-NEXT: cmn w18, #8, lsl #12 // =32768 ; CHECK-CVT-NEXT: csel w18, w18, w9, gt ; CHECK-CVT-NEXT: cmp w0, w8 ; CHECK-CVT-NEXT: csel w0, w0, w8, lt -; CHECK-CVT-NEXT: mov v2.s[3], w12 ; CHECK-CVT-NEXT: cmn w0, #8, lsl #12 // =32768 +; CHECK-CVT-NEXT: mov v2.s[2], w10 ; CHECK-CVT-NEXT: csel w0, w0, w9, gt ; CHECK-CVT-NEXT: cmp w1, w8 ; CHECK-CVT-NEXT: csel w1, w1, w8, lt @@ -3239,6 +3238,7 @@ define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) { ; CHECK-CVT-NEXT: cmn w1, #8, lsl #12 // =32768 ; CHECK-CVT-NEXT: csel w1, w1, w9, gt ; CHECK-CVT-NEXT: cmp w2, w8 +; CHECK-CVT-NEXT: mov v2.s[3], w12 ; CHECK-CVT-NEXT: csel w2, w2, w8, lt ; CHECK-CVT-NEXT: mov v3.s[1], w18 ; CHECK-CVT-NEXT: cmn w2, #8, lsl #12 // =32768 @@ -3253,18 +3253,18 @@ define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) { ; CHECK-CVT-NEXT: fcvtzs w4, s0 ; CHECK-CVT-NEXT: mov s0, v1.s[3] ; CHECK-CVT-NEXT: cmn w3, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: fmov s1, w14 +; CHECK-CVT-NEXT: fmov s1, w16 ; CHECK-CVT-NEXT: csel w11, w3, w9, gt ; CHECK-CVT-NEXT: mov v3.s[3], w2 ; CHECK-CVT-NEXT: fmov s4, w11 -; CHECK-CVT-NEXT: mov v1.s[1], w15 +; CHECK-CVT-NEXT: mov v1.s[1], w14 ; CHECK-CVT-NEXT: cmp w4, w8 ; CHECK-CVT-NEXT: fcvtzs w11, s0 ; CHECK-CVT-NEXT: mov v4.s[1], w13 ; CHECK-CVT-NEXT: csel w13, w4, w8, lt ; CHECK-CVT-NEXT: cmn w13, #8, lsl #12 // =32768 ; CHECK-CVT-NEXT: csel w10, w13, w9, gt -; CHECK-CVT-NEXT: mov v1.s[2], w16 +; CHECK-CVT-NEXT: mov v1.s[2], w15 ; CHECK-CVT-NEXT: cmp w11, w8 ; CHECK-CVT-NEXT: csel w8, w11, w8, lt ; CHECK-CVT-NEXT: mov v4.s[2], w10 @@ -3289,9 +3289,8 @@ define <8 x i8> @test_signed_v8f64_v8i8(<8 x double> %f) { ; CHECK-LABEL: test_signed_v8f64_v8i8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov d4, v3.d[1] -; CHECK-NEXT: fcvtzs w9, d3 -; CHECK-NEXT: mov w10, #127 // =0x7f -; CHECK-NEXT: mov w11, #-128 // =0xffffff80 +; CHECK-NEXT: fcvtzs w11, d3 +; CHECK-NEXT: mov w9, #127 // =0x7f ; CHECK-NEXT: mov d3, v1.d[1] ; CHECK-NEXT: fcvtzs w13, d2 ; CHECK-NEXT: fcvtzs w15, d1 @@ -3303,47 +3302,48 @@ define <8 x i8> @test_signed_v8f64_v8i8(<8 x double> %f) { ; CHECK-NEXT: cmp w8, #127 ; CHECK-NEXT: fcvtzs w12, d4 ; CHECK-NEXT: fcvtzs w16, d2 -; CHECK-NEXT: csel w8, w8, w10, lt -; CHECK-NEXT: cmn w8, #128 -; CHECK-NEXT: csel w8, w8, w11, gt -; CHECK-NEXT: cmp w9, #127 -; CHECK-NEXT: csel w9, w9, w10, lt -; CHECK-NEXT: cmn w9, #128 -; CHECK-NEXT: csel w9, w9, w11, gt +; CHECK-NEXT: csel w10, w8, w9, lt +; CHECK-NEXT: mov w8, #-128 // =0xffffff80 +; CHECK-NEXT: cmn w10, #128 +; CHECK-NEXT: csel w10, w10, w8, gt +; CHECK-NEXT: cmp w11, #127 +; CHECK-NEXT: csel w11, w11, w9, lt +; CHECK-NEXT: cmn w11, #128 +; CHECK-NEXT: csel w11, w11, w8, gt ; CHECK-NEXT: cmp w12, #127 -; CHECK-NEXT: csel w12, w12, w10, lt -; CHECK-NEXT: fmov s3, w9 +; CHECK-NEXT: csel w12, w12, w9, lt +; CHECK-NEXT: fmov s3, w11 ; CHECK-NEXT: cmn w12, #128 -; CHECK-NEXT: csel w12, w12, w11, gt +; CHECK-NEXT: csel w12, w12, w8, gt ; CHECK-NEXT: cmp w13, #127 -; CHECK-NEXT: csel w13, w13, w10, lt -; CHECK-NEXT: mov v3.s[1], w8 +; CHECK-NEXT: csel w13, w13, w9, lt +; CHECK-NEXT: mov v3.s[1], w10 ; CHECK-NEXT: cmn w13, #128 -; CHECK-NEXT: csel w13, w13, w11, gt +; CHECK-NEXT: csel w13, w13, w8, gt ; CHECK-NEXT: cmp w14, #127 -; CHECK-NEXT: csel w14, w14, w10, lt +; CHECK-NEXT: csel w14, w14, w9, lt ; CHECK-NEXT: fmov s2, w13 ; CHECK-NEXT: cmn w14, #128 -; CHECK-NEXT: csel w14, w14, w11, gt +; CHECK-NEXT: csel w14, w14, w8, gt ; CHECK-NEXT: cmp w15, #127 -; CHECK-NEXT: csel w15, w15, w10, lt +; CHECK-NEXT: csel w15, w15, w9, lt ; CHECK-NEXT: mov v2.s[1], w12 ; CHECK-NEXT: cmn w15, #128 -; CHECK-NEXT: csel w15, w15, w11, gt +; CHECK-NEXT: csel w15, w15, w8, gt ; CHECK-NEXT: cmp w16, #127 -; CHECK-NEXT: csel w9, w16, w10, lt +; CHECK-NEXT: csel w11, w16, w9, lt ; CHECK-NEXT: fmov s1, w15 -; CHECK-NEXT: cmn w9, #128 -; CHECK-NEXT: csel w8, w9, w11, gt +; CHECK-NEXT: cmn w11, #128 +; CHECK-NEXT: csel w10, w11, w8, gt ; CHECK-NEXT: cmp w17, #127 -; CHECK-NEXT: csel w9, w17, w10, lt +; CHECK-NEXT: csel w9, w17, w9, lt ; CHECK-NEXT: mov v1.s[1], w14 ; CHECK-NEXT: cmn w9, #128 -; CHECK-NEXT: csel w9, w9, w11, gt -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: csel w8, w9, w8, gt +; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: adrp x8, .LCPI82_0 ; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI82_0] +; CHECK-NEXT: mov v0.s[1], w10 ; CHECK-NEXT: tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b ; CHECK-NEXT: ret %x = call <8 x i8> @llvm.fptosi.sat.v8f64.v8i8(<8 x double> %f) @@ -3491,61 +3491,61 @@ define <8 x i16> @test_signed_v8f64_v8i16(<8 x double> %f) { ; CHECK-LABEL: test_signed_v8f64_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov d4, v3.d[1] -; CHECK-NEXT: mov w9, #32767 // =0x7fff -; CHECK-NEXT: fcvtzs w10, d3 -; CHECK-NEXT: mov w11, #-32768 // =0xffff8000 +; CHECK-NEXT: mov w8, #32767 // =0x7fff +; CHECK-NEXT: fcvtzs w11, d3 ; CHECK-NEXT: mov d3, v1.d[1] ; CHECK-NEXT: fcvtzs w13, d2 ; CHECK-NEXT: fcvtzs w15, d1 ; CHECK-NEXT: fcvtzs w17, d0 -; CHECK-NEXT: fcvtzs w8, d4 +; CHECK-NEXT: fcvtzs w9, d4 ; CHECK-NEXT: mov d4, v2.d[1] ; CHECK-NEXT: mov d2, v0.d[1] ; CHECK-NEXT: fcvtzs w14, d3 -; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: cmp w9, w8 ; CHECK-NEXT: fcvtzs w12, d4 ; CHECK-NEXT: fcvtzs w16, d2 -; CHECK-NEXT: csel w8, w8, w9, lt -; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w8, w8, w11, gt -; CHECK-NEXT: cmp w10, w9 -; CHECK-NEXT: csel w10, w10, w9, lt +; CHECK-NEXT: csel w10, w9, w8, lt +; CHECK-NEXT: mov w9, #-32768 // =0xffff8000 ; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w10, w10, w11, gt -; CHECK-NEXT: cmp w12, w9 -; CHECK-NEXT: csel w12, w12, w9, lt -; CHECK-NEXT: fmov s3, w10 +; CHECK-NEXT: csel w10, w10, w9, gt +; CHECK-NEXT: cmp w11, w8 +; CHECK-NEXT: csel w11, w11, w8, lt +; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w11, w11, w9, gt +; CHECK-NEXT: cmp w12, w8 +; CHECK-NEXT: csel w12, w12, w8, lt +; CHECK-NEXT: fmov s3, w11 ; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w12, w12, w11, gt -; CHECK-NEXT: cmp w13, w9 -; CHECK-NEXT: csel w13, w13, w9, lt -; CHECK-NEXT: mov v3.s[1], w8 +; CHECK-NEXT: csel w12, w12, w9, gt +; CHECK-NEXT: cmp w13, w8 +; CHECK-NEXT: csel w13, w13, w8, lt +; CHECK-NEXT: mov v3.s[1], w10 ; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w13, w13, w11, gt -; CHECK-NEXT: cmp w14, w9 -; CHECK-NEXT: csel w14, w14, w9, lt +; CHECK-NEXT: csel w13, w13, w9, gt +; CHECK-NEXT: cmp w14, w8 +; CHECK-NEXT: csel w14, w14, w8, lt ; CHECK-NEXT: fmov s2, w13 ; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w14, w14, w11, gt -; CHECK-NEXT: cmp w15, w9 -; CHECK-NEXT: csel w15, w15, w9, lt +; CHECK-NEXT: csel w14, w14, w9, gt +; CHECK-NEXT: cmp w15, w8 +; CHECK-NEXT: csel w15, w15, w8, lt ; CHECK-NEXT: mov v2.s[1], w12 ; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w15, w15, w11, gt -; CHECK-NEXT: cmp w16, w9 -; CHECK-NEXT: csel w10, w16, w9, lt +; CHECK-NEXT: csel w15, w15, w9, gt +; CHECK-NEXT: cmp w16, w8 +; CHECK-NEXT: csel w11, w16, w8, lt ; CHECK-NEXT: fmov s1, w15 -; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w8, w10, w11, gt -; CHECK-NEXT: cmp w17, w9 -; CHECK-NEXT: csel w9, w17, w9, lt +; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w10, w11, w9, gt +; CHECK-NEXT: cmp w17, w8 +; CHECK-NEXT: csel w8, w17, w8, lt ; CHECK-NEXT: mov v1.s[1], w14 -; CHECK-NEXT: cmn w9, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w9, w9, w11, gt -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w8, w8, w9, gt +; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: adrp x8, .LCPI84_0 ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI84_0] +; CHECK-NEXT: mov v0.s[1], w10 ; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b ; CHECK-NEXT: ret %x = call <8 x i16> @llvm.fptosi.sat.v8f64.v8i16(<8 x double> %f) @@ -3563,7 +3563,7 @@ define <16 x i16> @test_signed_v16f64_v16i16(<16 x double> %f) { ; CHECK-NEXT: fcvtzs w15, d1 ; CHECK-NEXT: mov d1, v7.d[1] ; CHECK-NEXT: fcvtzs w18, d0 -; CHECK-NEXT: fcvtzs w0, d7 +; CHECK-NEXT: fcvtzs w1, d7 ; CHECK-NEXT: fcvtzs w2, d6 ; CHECK-NEXT: fcvtzs w4, d5 ; CHECK-NEXT: fcvtzs w6, d4 @@ -3571,30 +3571,26 @@ define <16 x i16> @test_signed_v16f64_v16i16(<16 x double> %f) { ; CHECK-NEXT: mov d16, v2.d[1] ; CHECK-NEXT: mov d2, v0.d[1] ; CHECK-NEXT: mov d0, v6.d[1] +; CHECK-NEXT: fcvtzs w0, d1 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: fcvtzs w13, d16 ; CHECK-NEXT: fcvtzs w17, d2 ; CHECK-NEXT: csel w10, w8, w9, lt ; CHECK-NEXT: mov w8, #-32768 // =0xffff8000 -; CHECK-NEXT: fcvtzs w1, d0 ; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 -; CHECK-NEXT: mov d0, v5.d[1] ; CHECK-NEXT: csel w10, w10, w8, gt ; CHECK-NEXT: cmp w11, w9 ; CHECK-NEXT: csel w11, w11, w9, lt ; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w12, w11, w8, gt ; CHECK-NEXT: cmp w13, w9 -; CHECK-NEXT: fcvtzs w3, d0 ; CHECK-NEXT: csel w11, w13, w9, lt ; CHECK-NEXT: fcvtzs w13, d3 -; CHECK-NEXT: mov d0, v4.d[1] ; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w11, w11, w8, gt ; CHECK-NEXT: cmp w14, w9 ; CHECK-NEXT: csel w14, w14, w9, lt ; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768 -; CHECK-NEXT: fcvtzs w5, d0 ; CHECK-NEXT: csel w14, w14, w8, gt ; CHECK-NEXT: cmp w13, w9 ; CHECK-NEXT: csel w13, w13, w9, lt @@ -3606,60 +3602,64 @@ define <16 x i16> @test_signed_v16f64_v16i16(<16 x double> %f) { ; CHECK-NEXT: csel w16, w15, w8, gt ; CHECK-NEXT: cmp w17, w9 ; CHECK-NEXT: csel w15, w17, w9, lt -; CHECK-NEXT: fcvtzs w17, d1 -; CHECK-NEXT: fmov s3, w12 ; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w15, w15, w8, gt ; CHECK-NEXT: cmp w18, w9 -; CHECK-NEXT: csel w18, w18, w9, lt -; CHECK-NEXT: mov v3.s[1], w10 -; CHECK-NEXT: cmn w18, #8, lsl #12 // =32768 -; CHECK-NEXT: fmov s2, w14 -; CHECK-NEXT: csel w18, w18, w8, gt -; CHECK-NEXT: cmp w17, w9 -; CHECK-NEXT: csel w17, w17, w9, lt +; CHECK-NEXT: csel w17, w18, w9, lt ; CHECK-NEXT: cmn w17, #8, lsl #12 // =32768 -; CHECK-NEXT: mov v2.s[1], w11 ; CHECK-NEXT: csel w17, w17, w8, gt ; CHECK-NEXT: cmp w0, w9 -; CHECK-NEXT: fmov s1, w16 -; CHECK-NEXT: csel w0, w0, w9, lt -; CHECK-NEXT: cmn w0, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w0, w0, w8, gt +; CHECK-NEXT: csel w18, w0, w9, lt +; CHECK-NEXT: fcvtzs w0, d0 +; CHECK-NEXT: mov d0, v5.d[1] +; CHECK-NEXT: cmn w18, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w18, w18, w8, gt ; CHECK-NEXT: cmp w1, w9 -; CHECK-NEXT: mov v1.s[1], w13 ; CHECK-NEXT: csel w1, w1, w9, lt -; CHECK-NEXT: fmov s7, w0 -; CHECK-NEXT: fmov s0, w18 ; CHECK-NEXT: cmn w1, #8, lsl #12 // =32768 +; CHECK-NEXT: fcvtzs w3, d0 +; CHECK-NEXT: mov d0, v4.d[1] ; CHECK-NEXT: csel w1, w1, w8, gt +; CHECK-NEXT: cmp w0, w9 +; CHECK-NEXT: csel w0, w0, w9, lt +; CHECK-NEXT: fmov s7, w1 +; CHECK-NEXT: cmn w0, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w0, w0, w8, gt ; CHECK-NEXT: cmp w2, w9 +; CHECK-NEXT: fcvtzs w5, d0 ; CHECK-NEXT: csel w2, w2, w9, lt -; CHECK-NEXT: mov v7.s[1], w17 -; CHECK-NEXT: mov v0.s[1], w15 +; CHECK-NEXT: fmov s3, w12 +; CHECK-NEXT: mov v7.s[1], w18 ; CHECK-NEXT: cmn w2, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w2, w2, w8, gt ; CHECK-NEXT: cmp w3, w9 ; CHECK-NEXT: csel w3, w3, w9, lt +; CHECK-NEXT: mov v3.s[1], w10 ; CHECK-NEXT: fmov s6, w2 ; CHECK-NEXT: cmn w3, #8, lsl #12 // =32768 +; CHECK-NEXT: fmov s2, w14 ; CHECK-NEXT: csel w3, w3, w8, gt ; CHECK-NEXT: cmp w4, w9 ; CHECK-NEXT: csel w4, w4, w9, lt -; CHECK-NEXT: mov v6.s[1], w1 +; CHECK-NEXT: mov v6.s[1], w0 ; CHECK-NEXT: cmn w4, #8, lsl #12 // =32768 +; CHECK-NEXT: mov v2.s[1], w11 ; CHECK-NEXT: csel w12, w4, w8, gt ; CHECK-NEXT: cmp w5, w9 +; CHECK-NEXT: fmov s1, w16 ; CHECK-NEXT: csel w10, w5, w9, lt ; CHECK-NEXT: fmov s5, w12 ; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w10, w10, w8, gt ; CHECK-NEXT: cmp w6, w9 +; CHECK-NEXT: mov v1.s[1], w13 ; CHECK-NEXT: csel w9, w6, w9, lt ; CHECK-NEXT: mov v5.s[1], w3 +; CHECK-NEXT: fmov s0, w17 ; CHECK-NEXT: cmn w9, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w8, w9, w8, gt ; CHECK-NEXT: fmov s4, w8 +; CHECK-NEXT: mov v0.s[1], w15 ; CHECK-NEXT: adrp x8, .LCPI85_0 ; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI85_0] ; CHECK-NEXT: mov v4.s[1], w10 diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll index f23254cbf7b22..c94db3484994c 100644 --- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll @@ -2196,13 +2196,13 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: mov w8, #1904214015 // =0x717fffff ; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: fmov s9, w8 -; CHECK-NEXT: mov x22, #68719476735 // =0xfffffffff +; CHECK-NEXT: mov x23, #68719476735 // =0xfffffffff ; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: csel x8, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x10, x22, x8, gt +; CHECK-NEXT: csel x10, x23, x8, gt ; CHECK-NEXT: csinv x8, x9, xzr, le ; CHECK-NEXT: stp x8, x10, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 @@ -2213,9 +2213,9 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x9, x22, x9, gt -; CHECK-NEXT: csinv x24, x8, xzr, le -; CHECK-NEXT: str x9, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: csel x9, x23, x9, gt +; CHECK-NEXT: csinv x8, x8, xzr, le +; CHECK-NEXT: stp x8, x9, [sp] // 16-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload @@ -2226,7 +2226,7 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 ; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csel x25, x22, x9, gt +; CHECK-NEXT: csel x25, x23, x9, gt ; CHECK-NEXT: str x8, [sp, #32] // 8-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti @@ -2237,9 +2237,8 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csel x26, x22, x9, gt -; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill +; CHECK-NEXT: csel x26, x23, x9, gt +; CHECK-NEXT: csinv x28, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload @@ -2249,8 +2248,8 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x29, x22, x9, gt -; CHECK-NEXT: csinv x27, x8, xzr, le +; CHECK-NEXT: csel x29, x23, x9, gt +; CHECK-NEXT: csinv x20, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: fcmp s8, #0.0 @@ -2259,8 +2258,8 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x20, x22, x9, gt -; CHECK-NEXT: csinv x21, x8, xzr, le +; CHECK-NEXT: csel x21, x23, x9, gt +; CHECK-NEXT: csinv x27, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload @@ -2270,36 +2269,35 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x28, x22, x9, gt -; CHECK-NEXT: csinv x23, x8, xzr, le +; CHECK-NEXT: csel x22, x23, x9, gt +; CHECK-NEXT: csinv x24, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: ldr x9, [sp] // 8-byte Folded Reload -; CHECK-NEXT: extr x8, x20, x21, #28 +; CHECK-NEXT: extr x8, x21, x27, #28 +; CHECK-NEXT: extr x9, x29, x20, #28 +; CHECK-NEXT: stur x28, [x19, #75] ; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: bfi x28, x27, #36, #28 +; CHECK-NEXT: bfi x22, x20, #36, #28 ; CHECK-NEXT: lsr x11, x29, #28 -; CHECK-NEXT: bfi x26, x24, #36, #28 -; CHECK-NEXT: stur x9, [x19, #75] -; CHECK-NEXT: extr x9, x29, x27, #28 ; CHECK-NEXT: stur x8, [x19, #41] -; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: str x9, [x19, #16] +; CHECK-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: stp x23, x28, [x19] -; CHECK-NEXT: strb w11, [x19, #24] +; CHECK-NEXT: stp x24, x22, [x19] ; CHECK-NEXT: stur x10, [x19, #50] -; CHECK-NEXT: lsr x10, x20, #28 -; CHECK-NEXT: csel x9, x22, x9, gt -; CHECK-NEXT: bfi x9, x21, #36, #28 -; CHECK-NEXT: csinv x8, x8, xzr, le +; CHECK-NEXT: lsr x10, x21, #28 +; CHECK-NEXT: strb w11, [x19, #24] ; CHECK-NEXT: strb w10, [x19, #49] -; CHECK-NEXT: ldr x11, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: csel x9, x23, x9, gt +; CHECK-NEXT: csinv x8, x8, xzr, le +; CHECK-NEXT: ldp x12, x11, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bfi x9, x27, #36, #28 ; CHECK-NEXT: stur x8, [x19, #25] ; CHECK-NEXT: stur x9, [x19, #33] -; CHECK-NEXT: extr x10, x11, x24, #28 +; CHECK-NEXT: extr x10, x11, x12, #28 +; CHECK-NEXT: bfi x26, x12, #36, #28 ; CHECK-NEXT: stur x10, [x19, #91] ; CHECK-NEXT: ldp x10, x9, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: stur x26, [x19, #83] @@ -2849,56 +2847,56 @@ define <16 x i8> @test_unsigned_v16f64_v16i8(<16 x double> %f) { ; CHECK-NEXT: csel w9, w9, w8, lo ; CHECK-NEXT: fmov s3, w9 ; CHECK-NEXT: fcvtzu w9, d16 +; CHECK-NEXT: mov d16, v5.d[1] ; CHECK-NEXT: mov v0.b[5], w11 ; CHECK-NEXT: mov v3.s[1], w10 ; CHECK-NEXT: fcvtzu w10, d4 -; CHECK-NEXT: mov d4, v5.d[1] ; CHECK-NEXT: cmp w9, #255 ; CHECK-NEXT: csel w9, w9, w8, lo ; CHECK-NEXT: cmp w10, #255 ; CHECK-NEXT: mov w11, v3.s[1] ; CHECK-NEXT: mov v0.b[6], v3.b[0] ; CHECK-NEXT: csel w10, w10, w8, lo -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcvtzu w10, d4 -; CHECK-NEXT: mov d4, v6.d[1] +; CHECK-NEXT: fmov s4, w10 +; CHECK-NEXT: fcvtzu w10, d16 ; CHECK-NEXT: mov v0.b[7], w11 -; CHECK-NEXT: mov v16.s[1], w9 +; CHECK-NEXT: mov v4.s[1], w9 ; CHECK-NEXT: fcvtzu w9, d5 +; CHECK-NEXT: mov d5, v6.d[1] ; CHECK-NEXT: cmp w10, #255 ; CHECK-NEXT: csel w10, w10, w8, lo ; CHECK-NEXT: cmp w9, #255 -; CHECK-NEXT: mov w11, v16.s[1] -; CHECK-NEXT: mov v0.b[8], v16.b[0] +; CHECK-NEXT: mov w11, v4.s[1] +; CHECK-NEXT: mov v0.b[8], v4.b[0] ; CHECK-NEXT: csel w9, w9, w8, lo -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: fcvtzu w9, d4 -; CHECK-NEXT: mov d4, v7.d[1] +; CHECK-NEXT: fmov s16, w9 +; CHECK-NEXT: fcvtzu w9, d5 +; CHECK-NEXT: mov d5, v7.d[1] ; CHECK-NEXT: mov v0.b[9], w11 -; CHECK-NEXT: mov v5.s[1], w10 +; CHECK-NEXT: mov v16.s[1], w10 ; CHECK-NEXT: fcvtzu w10, d6 ; CHECK-NEXT: cmp w9, #255 ; CHECK-NEXT: csel w9, w9, w8, lo ; CHECK-NEXT: cmp w10, #255 -; CHECK-NEXT: mov v0.b[10], v5.b[0] -; CHECK-NEXT: mov w11, v5.s[1] +; CHECK-NEXT: mov v0.b[10], v16.b[0] +; CHECK-NEXT: mov w11, v16.s[1] ; CHECK-NEXT: csel w10, w10, w8, lo ; CHECK-NEXT: fmov s6, w10 ; CHECK-NEXT: fcvtzu w10, d7 ; CHECK-NEXT: mov v0.b[11], w11 ; CHECK-NEXT: mov v6.s[1], w9 -; CHECK-NEXT: fcvtzu w9, d4 +; CHECK-NEXT: fcvtzu w9, d5 ; CHECK-NEXT: cmp w9, #255 ; CHECK-NEXT: mov v0.b[12], v6.b[0] ; CHECK-NEXT: mov w11, v6.s[1] ; CHECK-NEXT: csel w9, w9, w8, lo ; CHECK-NEXT: cmp w10, #255 ; CHECK-NEXT: csel w8, w10, w8, lo -; CHECK-NEXT: fmov s4, w8 +; CHECK-NEXT: fmov s5, w8 ; CHECK-NEXT: mov v0.b[13], w11 -; CHECK-NEXT: mov v4.s[1], w9 -; CHECK-NEXT: mov v0.b[14], v4.b[0] -; CHECK-NEXT: mov w8, v4.s[1] +; CHECK-NEXT: mov v5.s[1], w9 +; CHECK-NEXT: mov v0.b[14], v5.b[0] +; CHECK-NEXT: mov w8, v5.s[1] ; CHECK-NEXT: mov v0.b[15], w8 ; CHECK-NEXT: ret %x = call <16 x i8> @llvm.fptoui.sat.v16f64.v16i8(<16 x double> %f) @@ -2961,76 +2959,76 @@ define <16 x i16> @test_unsigned_v16f64_v16i16(<16 x double> %f) { ; CHECK-NEXT: mov w8, #65535 // =0xffff ; CHECK-NEXT: fcvtzu w9, d3 ; CHECK-NEXT: mov d3, v1.d[1] -; CHECK-NEXT: fcvtzu w11, d1 +; CHECK-NEXT: fcvtzu w10, d1 ; CHECK-NEXT: mov d1, v0.d[1] -; CHECK-NEXT: fcvtzu w10, d2 -; CHECK-NEXT: fcvtzu w13, d0 +; CHECK-NEXT: fcvtzu w11, d2 +; CHECK-NEXT: fcvtzu w12, d0 ; CHECK-NEXT: mov d0, v7.d[1] ; CHECK-NEXT: mov d2, v6.d[1] -; CHECK-NEXT: fcvtzu w15, d7 -; CHECK-NEXT: fcvtzu w12, d16 -; CHECK-NEXT: fcvtzu w14, d17 -; CHECK-NEXT: fcvtzu w16, d6 +; CHECK-NEXT: fcvtzu w14, d7 +; CHECK-NEXT: fcvtzu w13, d16 +; CHECK-NEXT: fcvtzu w16, d17 +; CHECK-NEXT: fcvtzu w15, d6 ; CHECK-NEXT: fcvtzu w17, d3 ; CHECK-NEXT: mov d6, v5.d[1] ; CHECK-NEXT: mov d3, v4.d[1] ; CHECK-NEXT: fcvtzu w18, d1 -; CHECK-NEXT: cmp w12, w8 -; CHECK-NEXT: csel w12, w12, w8, lo +; CHECK-NEXT: cmp w13, w8 +; CHECK-NEXT: csel w13, w13, w8, lo ; CHECK-NEXT: cmp w9, w8 ; CHECK-NEXT: csel w9, w9, w8, lo -; CHECK-NEXT: cmp w14, w8 +; CHECK-NEXT: cmp w16, w8 ; CHECK-NEXT: fmov s19, w9 -; CHECK-NEXT: csel w9, w14, w8, lo -; CHECK-NEXT: cmp w10, w8 -; CHECK-NEXT: fcvtzu w14, d0 -; CHECK-NEXT: csel w10, w10, w8, lo -; CHECK-NEXT: cmp w17, w8 -; CHECK-NEXT: mov v19.s[1], w12 -; CHECK-NEXT: csel w12, w17, w8, lo +; CHECK-NEXT: csel w9, w16, w8, lo ; CHECK-NEXT: cmp w11, w8 +; CHECK-NEXT: fcvtzu w16, d0 ; CHECK-NEXT: csel w11, w11, w8, lo +; CHECK-NEXT: cmp w17, w8 +; CHECK-NEXT: mov v19.s[1], w13 +; CHECK-NEXT: csel w13, w17, w8, lo +; CHECK-NEXT: cmp w10, w8 +; CHECK-NEXT: csel w10, w10, w8, lo ; CHECK-NEXT: cmp w18, w8 -; CHECK-NEXT: fmov s18, w10 -; CHECK-NEXT: csel w10, w18, w8, lo -; CHECK-NEXT: cmp w13, w8 +; CHECK-NEXT: fmov s18, w11 +; CHECK-NEXT: csel w11, w18, w8, lo +; CHECK-NEXT: cmp w12, w8 ; CHECK-NEXT: fcvtzu w17, d2 -; CHECK-NEXT: csel w13, w13, w8, lo -; CHECK-NEXT: cmp w14, w8 +; CHECK-NEXT: csel w12, w12, w8, lo +; CHECK-NEXT: cmp w16, w8 ; CHECK-NEXT: fcvtzu w18, d6 ; CHECK-NEXT: mov v18.s[1], w9 -; CHECK-NEXT: csel w9, w14, w8, lo -; CHECK-NEXT: cmp w15, w8 -; CHECK-NEXT: fmov s17, w11 -; CHECK-NEXT: csel w11, w15, w8, lo -; CHECK-NEXT: fcvtzu w14, d5 -; CHECK-NEXT: fmov s23, w11 +; CHECK-NEXT: csel w9, w16, w8, lo +; CHECK-NEXT: cmp w14, w8 +; CHECK-NEXT: fmov s17, w10 +; CHECK-NEXT: csel w10, w14, w8, lo +; CHECK-NEXT: fcvtzu w16, d5 +; CHECK-NEXT: fmov s23, w10 ; CHECK-NEXT: cmp w17, w8 -; CHECK-NEXT: fcvtzu w15, d3 -; CHECK-NEXT: csel w11, w17, w8, lo -; CHECK-NEXT: cmp w16, w8 +; CHECK-NEXT: fcvtzu w14, d3 +; CHECK-NEXT: csel w10, w17, w8, lo +; CHECK-NEXT: cmp w15, w8 ; CHECK-NEXT: fcvtzu w17, d4 -; CHECK-NEXT: mov v17.s[1], w12 +; CHECK-NEXT: mov v17.s[1], w13 ; CHECK-NEXT: mov v23.s[1], w9 -; CHECK-NEXT: csel w9, w16, w8, lo +; CHECK-NEXT: csel w9, w15, w8, lo ; CHECK-NEXT: cmp w18, w8 ; CHECK-NEXT: fmov s22, w9 ; CHECK-NEXT: csel w9, w18, w8, lo +; CHECK-NEXT: cmp w16, w8 +; CHECK-NEXT: fmov s16, w12 +; CHECK-NEXT: mov v22.s[1], w10 +; CHECK-NEXT: csel w10, w16, w8, lo ; CHECK-NEXT: cmp w14, w8 -; CHECK-NEXT: fmov s16, w13 -; CHECK-NEXT: mov v22.s[1], w11 -; CHECK-NEXT: csel w11, w14, w8, lo -; CHECK-NEXT: cmp w15, w8 -; CHECK-NEXT: fmov s21, w11 -; CHECK-NEXT: csel w11, w15, w8, lo +; CHECK-NEXT: fmov s21, w10 +; CHECK-NEXT: csel w10, w14, w8, lo ; CHECK-NEXT: cmp w17, w8 ; CHECK-NEXT: csel w8, w17, w8, lo -; CHECK-NEXT: mov v16.s[1], w10 +; CHECK-NEXT: mov v16.s[1], w11 ; CHECK-NEXT: mov v21.s[1], w9 ; CHECK-NEXT: fmov s20, w8 ; CHECK-NEXT: adrp x8, .LCPI85_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI85_0] -; CHECK-NEXT: mov v20.s[1], w11 +; CHECK-NEXT: mov v20.s[1], w10 ; CHECK-NEXT: tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b ; CHECK-NEXT: tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll index 90e93577efd9f..16a6ba3f8cc93 100644 --- a/llvm/test/CodeGen/AArch64/frem.ll +++ b/llvm/test/CodeGen/AArch64/frem.ll @@ -610,7 +610,7 @@ define <8 x float> @frem_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: mov s15, v1.s[2] ; CHECK-GI-NEXT: mov s13, v1.s[3] ; CHECK-GI-NEXT: // kill: def $s1 killed $s1 killed $q1 -; CHECK-GI-NEXT: str s2, [sp, #64] // 4-byte Folded Spill +; CHECK-GI-NEXT: str s2, [sp, #48] // 4-byte Folded Spill ; CHECK-GI-NEXT: mov s2, v4.s[2] ; CHECK-GI-NEXT: str s2, [sp, #112] // 4-byte Folded Spill ; CHECK-GI-NEXT: mov s2, v3.s[3] @@ -628,17 +628,17 @@ define <8 x float> @frem_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: fmov s0, s10 ; CHECK-GI-NEXT: fmov s1, s13 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-GI-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-GI-NEXT: bl fmodf ; CHECK-GI-NEXT: fmov s1, s12 ; CHECK-GI-NEXT: str d0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr s0, [sp, #64] // 4-byte Folded Reload +; CHECK-GI-NEXT: ldr s0, [sp, #48] // 4-byte Folded Reload ; CHECK-GI-NEXT: bl fmodf ; CHECK-GI-NEXT: fmov s1, s11 -; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr s0, [sp, #112] // 4-byte Folded Reload ; CHECK-GI-NEXT: bl fmodf ; CHECK-GI-NEXT: str d0, [sp, #112] // 16-byte Folded Spill @@ -651,7 +651,7 @@ define <8 x float> @frem_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: ldr x30, [sp, #192] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #144] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[1], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload @@ -659,7 +659,7 @@ define <8 x float> @frem_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: mov v1.s[2], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[2], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[3], v2.s[0] ; CHECK-GI-NEXT: mov v3.s[3], v0.s[0] ; CHECK-GI-NEXT: mov v2.16b, v1.16b @@ -777,15 +777,15 @@ define <7 x half> @frem_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov h2, v0.h[5] -; CHECK-GI-NEXT: mov h8, v0.h[1] -; CHECK-GI-NEXT: mov h9, v0.h[2] -; CHECK-GI-NEXT: mov h10, v0.h[3] -; CHECK-GI-NEXT: mov h11, v0.h[4] +; CHECK-GI-NEXT: mov h9, v0.h[1] +; CHECK-GI-NEXT: mov h10, v0.h[2] +; CHECK-GI-NEXT: mov h11, v0.h[3] +; CHECK-GI-NEXT: mov h12, v0.h[4] ; CHECK-GI-NEXT: mov h14, v1.h[1] ; CHECK-GI-NEXT: mov h15, v1.h[2] -; CHECK-GI-NEXT: mov h13, v1.h[3] -; CHECK-GI-NEXT: mov h12, v1.h[4] -; CHECK-GI-NEXT: str h2, [sp, #64] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h8, v1.h[3] +; CHECK-GI-NEXT: mov h13, v1.h[4] +; CHECK-GI-NEXT: str h2, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h2, v0.h[6] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: str h2, [sp, #80] // 2-byte Folded Spill @@ -795,34 +795,34 @@ define <7 x half> @frem_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str h2, [sp, #174] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h8 +; CHECK-GI-NEXT: fcvt s2, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h14 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h9 +; CHECK-GI-NEXT: fcvt s2, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h10 +; CHECK-GI-NEXT: fcvt s2, h11 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h13 +; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h11 +; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h12 +; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr h0, [sp, #172] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 @@ -835,19 +835,20 @@ define <7 x half> @frem_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #160] // 8-byte Folded Reload -; CHECK-GI-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] @@ -1081,17 +1082,17 @@ define <8 x half> @frem_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov h2, v0.h[5] -; CHECK-GI-NEXT: mov h10, v0.h[1] -; CHECK-GI-NEXT: mov h11, v0.h[2] -; CHECK-GI-NEXT: mov h12, v0.h[3] -; CHECK-GI-NEXT: mov h13, v0.h[4] +; CHECK-GI-NEXT: mov h11, v0.h[1] +; CHECK-GI-NEXT: mov h12, v0.h[2] +; CHECK-GI-NEXT: mov h13, v0.h[3] +; CHECK-GI-NEXT: mov h14, v0.h[4] ; CHECK-GI-NEXT: mov h8, v1.h[1] ; CHECK-GI-NEXT: mov h9, v1.h[2] -; CHECK-GI-NEXT: mov h15, v1.h[3] -; CHECK-GI-NEXT: mov h14, v1.h[4] +; CHECK-GI-NEXT: mov h10, v1.h[3] +; CHECK-GI-NEXT: mov h15, v1.h[4] ; CHECK-GI-NEXT: str h2, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h2, v0.h[6] -; CHECK-GI-NEXT: str h2, [sp, #80] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h2, [sp, #64] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h2, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: str h2, [sp, #96] // 2-byte Folded Spill @@ -1103,27 +1104,27 @@ define <8 x half> @frem_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str h2, [sp, #190] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h10 +; CHECK-GI-NEXT: fcvt s2, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h8 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h11 +; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h12 +; CHECK-GI-NEXT: fcvt s2, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h15 +; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h13 +; CHECK-GI-NEXT: fcvt s2, h14 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h14 +; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf @@ -1135,10 +1136,10 @@ define <8 x half> @frem_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr h0, [sp, #188] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 @@ -1151,7 +1152,7 @@ define <8 x half> @frem_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: ldp d9, d8, [sp, #160] // 16-byte Folded Reload @@ -1166,7 +1167,7 @@ define <8 x half> @frem_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] @@ -1369,27 +1370,27 @@ define <16 x half> @frem_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: .cfi_offset b13, -64 ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 -; CHECK-GI-NEXT: mov h4, v0.h[4] +; CHECK-GI-NEXT: mov v4.16b, v1.16b ; CHECK-GI-NEXT: str q1, [sp, #112] // 16-byte Folded Spill -; CHECK-GI-NEXT: mov h11, v0.h[1] -; CHECK-GI-NEXT: mov h12, v0.h[2] -; CHECK-GI-NEXT: mov h13, v0.h[3] +; CHECK-GI-NEXT: mov h1, v0.h[4] +; CHECK-GI-NEXT: mov h12, v0.h[1] +; CHECK-GI-NEXT: mov h13, v0.h[2] ; CHECK-GI-NEXT: str q3, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov h14, v0.h[3] ; CHECK-GI-NEXT: mov h15, v2.h[1] ; CHECK-GI-NEXT: mov h8, v2.h[2] ; CHECK-GI-NEXT: mov h9, v2.h[3] ; CHECK-GI-NEXT: mov h10, v2.h[4] -; CHECK-GI-NEXT: mov h14, v2.h[5] -; CHECK-GI-NEXT: str h4, [sp, #288] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h4, v0.h[5] -; CHECK-GI-NEXT: str h4, [sp, #240] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h4, v0.h[6] -; CHECK-GI-NEXT: str h4, [sp, #176] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h4, v0.h[7] +; CHECK-GI-NEXT: mov h11, v2.h[5] +; CHECK-GI-NEXT: str h1, [sp, #272] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v0.h[5] +; CHECK-GI-NEXT: str h1, [sp, #240] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v0.h[6] +; CHECK-GI-NEXT: str h1, [sp, #176] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: str h4, [sp, #144] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov v4.16b, v1.16b -; CHECK-GI-NEXT: mov h1, v1.h[1] +; CHECK-GI-NEXT: str h1, [sp, #144] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v4.h[1] ; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[2] ; CHECK-GI-NEXT: str h1, [sp, #80] // 2-byte Folded Spill @@ -1400,7 +1401,7 @@ define <16 x half> @frem_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: mov h1, v4.h[5] ; CHECK-GI-NEXT: str h1, [sp, #256] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[6] -; CHECK-GI-NEXT: str h1, [sp, #320] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #336] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[7] ; CHECK-GI-NEXT: str h1, [sp, #352] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[6] @@ -1418,40 +1419,40 @@ define <16 x half> @frem_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: mov h1, v3.h[5] ; CHECK-GI-NEXT: str h1, [sp, #174] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[6] -; CHECK-GI-NEXT: str h1, [sp, #222] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #238] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[7] -; CHECK-GI-NEXT: str h1, [sp, #286] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #302] // 2-byte Folded Spill ; CHECK-GI-NEXT: fcvt s1, h2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h11 +; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp, #304] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h12 +; CHECK-GI-NEXT: fcvt s2, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h8 -; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h13 +; CHECK-GI-NEXT: fcvt s2, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h9 -; CHECK-GI-NEXT: str q0, [sp, #336] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #320] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #288] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #272] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 ; CHECK-GI-NEXT: fcvt s1, h10 -; CHECK-GI-NEXT: str q0, [sp, #288] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #272] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf ; CHECK-GI-NEXT: ldr h1, [sp, #240] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: fcvt s1, h14 +; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: str q0, [sp, #240] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf @@ -1519,11 +1520,11 @@ define <16 x half> @frem_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #320] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #336] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #320] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #222] // 2-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp, #336] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr h0, [sp, #238] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf @@ -1531,47 +1532,46 @@ define <16 x half> @frem_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 ; CHECK-GI-NEXT: str q0, [sp, #352] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #286] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h0, [sp, #302] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr q1, [sp, #304] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #224] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q3, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q3, [sp, #304] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #208] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #432] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #416] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #336] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #320] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #400] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #384] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #288] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #272] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d15, d14, [sp, #368] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #192] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #192] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #240] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #256] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #256] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #320] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #336] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 ; CHECK-GI-NEXT: ldr q0, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #352] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] +; CHECK-GI-NEXT: ldr q0, [sp, #352] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] ; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] -; CHECK-GI-NEXT: mov v3.h[7], v2.h[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b -; CHECK-GI-NEXT: mov v1.16b, v3.16b +; CHECK-GI-NEXT: mov v3.h[7], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[7], v2.h[0] +; CHECK-GI-NEXT: mov v0.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #448 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll index e2cd3835e4499..ed0d0d5194835 100644 --- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll +++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll @@ -272,22 +272,21 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) { ; GISEL-NEXT: bl exp10f ; GISEL-NEXT: fcvt s1, h9 ; GISEL-NEXT: fcvt h0, s0 -; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill +; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; GISEL-NEXT: fmov s0, s1 ; GISEL-NEXT: bl exp10f ; GISEL-NEXT: fcvt s1, h10 ; GISEL-NEXT: fcvt h0, s0 -; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill ; GISEL-NEXT: fmov s0, s1 ; GISEL-NEXT: bl exp10f -; GISEL-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload -; GISEL-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; GISEL-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload ; GISEL-NEXT: fcvt h0, s0 ; GISEL-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload ; GISEL-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload -; GISEL-NEXT: mov v1.h[1], v2.h[0] -; GISEL-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload ; GISEL-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload +; GISEL-NEXT: mov v1.h[1], v2.h[0] +; GISEL-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; GISEL-NEXT: mov v1.h[2], v2.h[0] ; GISEL-NEXT: mov v1.h[3], v0.h[0] ; GISEL-NEXT: mov v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index 40a8128857cb7..74048b8bee332 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -671,152 +671,152 @@ define i32 @test_sdot_v24i8_double(<24 x i8> %a, <24 x i8> %b, <24 x i8> %c, <24 ; CHECK-NEXT: ldr b3, [sp, #80] ; CHECK-NEXT: mov v0.b[1], w1 ; CHECK-NEXT: ldr b4, [sp, #528] -; CHECK-NEXT: ldr b6, [sp, #656] ; CHECK-NEXT: add x10, sp, #88 ; CHECK-NEXT: ld1 { v2.b }[1], [x11] ; CHECK-NEXT: add x11, sp, #536 -; CHECK-NEXT: ld1 { v1.b }[2], [x9] ; CHECK-NEXT: ldr b5, [sp, #336] -; CHECK-NEXT: ldr b7, [sp, #464] -; CHECK-NEXT: add x12, sp, #664 +; CHECK-NEXT: ld1 { v1.b }[2], [x9] ; CHECK-NEXT: ld1 { v3.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #344 -; CHECK-NEXT: mov v0.b[2], w2 ; CHECK-NEXT: ld1 { v4.b }[1], [x11] ; CHECK-NEXT: add x11, sp, #176 +; CHECK-NEXT: ldr b6, [sp, #656] +; CHECK-NEXT: mov v0.b[2], w2 +; CHECK-NEXT: ld1 { v5.b }[1], [x10] +; CHECK-NEXT: ldr b7, [sp, #464] ; CHECK-NEXT: ld1 { v1.b }[3], [x8] +; CHECK-NEXT: add x12, sp, #664 ; CHECK-NEXT: add x9, sp, #472 ; CHECK-NEXT: ld1 { v6.b }[1], [x12] -; CHECK-NEXT: ld1 { v5.b }[1], [x10] +; CHECK-NEXT: add x8, sp, #96 +; CHECK-NEXT: add x10, sp, #184 ; CHECK-NEXT: add x12, sp, #288 ; CHECK-NEXT: ld1 { v7.b }[1], [x9] -; CHECK-NEXT: ld1 { v2.b }[2], [x12] -; CHECK-NEXT: add x8, sp, #96 -; CHECK-NEXT: add x13, sp, #544 +; CHECK-NEXT: ld1 { v3.b }[2], [x8] ; CHECK-NEXT: mov v0.b[3], w3 ; CHECK-NEXT: ld1 { v1.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #672 -; CHECK-NEXT: add x10, sp, #184 -; CHECK-NEXT: ld1 { v3.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #352 -; CHECK-NEXT: ld1 { v4.b }[2], [x13] -; CHECK-NEXT: ld1 { v6.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #480 +; CHECK-NEXT: ld1 { v2.b }[2], [x12] +; CHECK-NEXT: add x13, sp, #544 ; CHECK-NEXT: ld1 { v5.b }[2], [x8] -; CHECK-NEXT: ld1 { v7.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #296 -; CHECK-NEXT: mov v0.b[4], w4 -; CHECK-NEXT: ld1 { v1.b }[5], [x10] -; CHECK-NEXT: ld1 { v2.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #552 +; CHECK-NEXT: add x8, sp, #672 +; CHECK-NEXT: ld1 { v4.b }[2], [x13] ; CHECK-NEXT: add x9, sp, #192 -; CHECK-NEXT: add x15, sp, #104 -; CHECK-NEXT: ld1 { v4.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #360 +; CHECK-NEXT: ld1 { v1.b }[5], [x10] +; CHECK-NEXT: ld1 { v6.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #480 +; CHECK-NEXT: mov v0.b[4], w4 +; CHECK-NEXT: ld1 { v7.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #296 +; CHECK-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #552 ; CHECK-NEXT: add x12, sp, #200 -; CHECK-NEXT: ld1 { v5.b }[3], [x11] ; CHECK-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-NEXT: ld1 { v4.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #360 +; CHECK-NEXT: ld1 { v5.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #104 ; CHECK-NEXT: add x9, sp, #560 ; CHECK-NEXT: mov v0.b[5], w5 -; CHECK-NEXT: ld1 { v3.b }[3], [x15] -; CHECK-NEXT: add x15, sp, #368 +; CHECK-NEXT: ld1 { v3.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #368 +; CHECK-NEXT: ld1 { v1.b }[7], [x12] ; CHECK-NEXT: ld1 { v4.b }[4], [x9] ; CHECK-NEXT: add x13, sp, #208 -; CHECK-NEXT: add x8, sp, #216 -; CHECK-NEXT: ld1 { v5.b }[4], [x15] -; CHECK-NEXT: ld1 { v1.b }[7], [x12] -; CHECK-NEXT: add x12, sp, #568 -; CHECK-NEXT: add x14, sp, #224 -; CHECK-NEXT: add x16, sp, #304 -; CHECK-NEXT: add x10, sp, #232 -; CHECK-NEXT: mov v0.b[6], w6 -; CHECK-NEXT: ld1 { v4.b }[5], [x12] -; CHECK-NEXT: add x12, sp, #376 -; CHECK-NEXT: ld1 { v5.b }[5], [x12] +; CHECK-NEXT: ld1 { v5.b }[4], [x8] +; CHECK-NEXT: add x12, sp, #304 +; CHECK-NEXT: add x8, sp, #568 +; CHECK-NEXT: ld1 { v2.b }[4], [x12] ; CHECK-NEXT: add x12, sp, #16 +; CHECK-NEXT: add x17, sp, #376 +; CHECK-NEXT: mov v0.b[6], w6 ; CHECK-NEXT: ld1 { v1.b }[8], [x13] +; CHECK-NEXT: ld1 { v4.b }[5], [x8] +; CHECK-NEXT: add x14, sp, #216 +; CHECK-NEXT: ld1 { v5.b }[5], [x17] ; CHECK-NEXT: add x13, sp, #576 -; CHECK-NEXT: ld1 { v2.b }[4], [x16] -; CHECK-NEXT: add x11, sp, #240 +; CHECK-NEXT: add x11, sp, #224 +; CHECK-NEXT: add x10, sp, #232 +; CHECK-NEXT: add x15, sp, #240 +; CHECK-NEXT: ld1 { v1.b }[9], [x14] ; CHECK-NEXT: ld1 { v4.b }[6], [x13] ; CHECK-NEXT: add x13, sp, #384 -; CHECK-NEXT: add x9, sp, #248 ; CHECK-NEXT: mov v0.b[7], w7 -; CHECK-NEXT: ld1 { v1.b }[9], [x8] ; CHECK-NEXT: ld1 { v5.b }[6], [x13] ; CHECK-NEXT: add x13, sp, #112 -; CHECK-NEXT: add x8, sp, #584 -; CHECK-NEXT: add x15, sp, #256 ; CHECK-NEXT: ld1 { v3.b }[4], [x13] ; CHECK-NEXT: add x13, sp, #32 -; CHECK-NEXT: ld1 { v4.b }[7], [x8] -; CHECK-NEXT: ld1 { v1.b }[10], [x14] -; CHECK-NEXT: add x14, sp, #312 -; CHECK-NEXT: add x8, sp, #40 +; CHECK-NEXT: add x14, sp, #584 +; CHECK-NEXT: ld1 { v1.b }[10], [x11] +; CHECK-NEXT: ld1 { v4.b }[7], [x14] +; CHECK-NEXT: add x11, sp, #312 +; CHECK-NEXT: add x14, sp, #40 +; CHECK-NEXT: ld1 { v2.b }[5], [x11] +; CHECK-NEXT: add x11, sp, #592 ; CHECK-NEXT: ld1 { v0.b }[8], [x12] ; CHECK-NEXT: add x12, sp, #24 -; CHECK-NEXT: ld1 { v2.b }[5], [x14] -; CHECK-NEXT: add x14, sp, #592 -; CHECK-NEXT: add x16, sp, #264 -; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: add x16, sp, #248 ; CHECK-NEXT: ld1 { v1.b }[11], [x10] -; CHECK-NEXT: ld1 { v4.b }[8], [x14] -; CHECK-NEXT: add x14, sp, #400 +; CHECK-NEXT: ld1 { v4.b }[8], [x11] +; CHECK-NEXT: add x11, sp, #400 +; CHECK-NEXT: add x9, sp, #256 +; CHECK-NEXT: add x8, sp, #264 +; CHECK-NEXT: add x10, sp, #72 ; CHECK-NEXT: ld1 { v0.b }[9], [x12] ; CHECK-NEXT: add x12, sp, #392 -; CHECK-NEXT: add x10, sp, #72 +; CHECK-NEXT: movi v16.2d, #0000000000000000 ; CHECK-NEXT: ld1 { v5.b }[7], [x12] ; CHECK-NEXT: add x12, sp, #48 +; CHECK-NEXT: ld1 { v1.b }[12], [x15] +; CHECK-NEXT: add x15, sp, #120 ; CHECK-NEXT: movi v17.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v1.b }[12], [x11] -; CHECK-NEXT: add x11, sp, #120 ; CHECK-NEXT: movi v18.2d, #0000000000000000 ; CHECK-NEXT: ld1 { v0.b }[10], [x13] -; CHECK-NEXT: ld1 { v3.b }[5], [x11] -; CHECK-NEXT: add x11, sp, #408 -; CHECK-NEXT: ld1 { v5.b }[8], [x14] +; CHECK-NEXT: ld1 { v3.b }[5], [x15] +; CHECK-NEXT: add x15, sp, #408 +; CHECK-NEXT: ld1 { v5.b }[8], [x11] ; CHECK-NEXT: add x13, sp, #56 -; CHECK-NEXT: add x14, sp, #64 -; CHECK-NEXT: ld1 { v1.b }[13], [x9] -; CHECK-NEXT: add x9, sp, #616 +; CHECK-NEXT: ld1 { v1.b }[13], [x16] +; CHECK-NEXT: add x11, sp, #64 +; CHECK-NEXT: add x16, sp, #616 ; CHECK-NEXT: movi v19.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v0.b }[11], [x8] -; CHECK-NEXT: add x8, sp, #600 -; CHECK-NEXT: ld1 { v4.b }[9], [x8] -; CHECK-NEXT: ld1 { v5.b }[9], [x11] -; CHECK-NEXT: add x11, sp, #608 -; CHECK-NEXT: ld1 { v1.b }[14], [x15] -; CHECK-NEXT: add x15, sp, #488 -; CHECK-NEXT: add x8, sp, #320 +; CHECK-NEXT: ld1 { v0.b }[11], [x14] +; CHECK-NEXT: add x14, sp, #600 +; CHECK-NEXT: ld1 { v4.b }[9], [x14] +; CHECK-NEXT: ld1 { v5.b }[9], [x15] +; CHECK-NEXT: add x15, sp, #608 +; CHECK-NEXT: ld1 { v1.b }[14], [x9] +; CHECK-NEXT: add x9, sp, #488 +; CHECK-NEXT: add x14, sp, #320 ; CHECK-NEXT: ld1 { v0.b }[12], [x12] -; CHECK-NEXT: ld1 { v7.b }[3], [x15] -; CHECK-NEXT: ld1 { v2.b }[6], [x8] -; CHECK-NEXT: ld1 { v4.b }[10], [x11] -; CHECK-NEXT: add x8, sp, #624 +; CHECK-NEXT: ld1 { v7.b }[3], [x9] +; CHECK-NEXT: ld1 { v2.b }[6], [x14] +; CHECK-NEXT: ld1 { v4.b }[10], [x15] +; CHECK-NEXT: add x14, sp, #624 +; CHECK-NEXT: add x9, sp, #688 +; CHECK-NEXT: ld1 { v1.b }[15], [x8] +; CHECK-NEXT: add x8, sp, #432 ; CHECK-NEXT: add x12, sp, #328 -; CHECK-NEXT: add x11, sp, #128 -; CHECK-NEXT: ld1 { v1.b }[15], [x16] ; CHECK-NEXT: ld1 { v0.b }[13], [x13] ; CHECK-NEXT: add x13, sp, #416 -; CHECK-NEXT: ld1 { v3.b }[6], [x11] +; CHECK-NEXT: ld1 { v2.b }[7], [x12] ; CHECK-NEXT: ld1 { v5.b }[10], [x13] -; CHECK-NEXT: ld1 { v4.b }[11], [x9] -; CHECK-NEXT: add x9, sp, #680 -; CHECK-NEXT: ld1 { v6.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #688 +; CHECK-NEXT: ld1 { v4.b }[11], [x16] +; CHECK-NEXT: add x16, sp, #680 +; CHECK-NEXT: ld1 { v6.b }[3], [x16] ; CHECK-NEXT: add x13, sp, #632 -; CHECK-NEXT: ld1 { v0.b }[14], [x14] -; CHECK-NEXT: add x14, sp, #424 -; CHECK-NEXT: ld1 { v2.b }[7], [x12] -; CHECK-NEXT: ld1 { v5.b }[11], [x14] -; CHECK-NEXT: ld1 { v4.b }[12], [x8] -; CHECK-NEXT: add x8, sp, #432 -; CHECK-NEXT: ld1 { v6.b }[4], [x9] -; CHECK-NEXT: add x11, sp, #696 ; CHECK-NEXT: add x12, sp, #504 +; CHECK-NEXT: ld1 { v0.b }[14], [x11] +; CHECK-NEXT: add x11, sp, #424 +; CHECK-NEXT: add x15, sp, #128 +; CHECK-NEXT: ld1 { v5.b }[11], [x11] +; CHECK-NEXT: ld1 { v4.b }[12], [x14] +; CHECK-NEXT: add x11, sp, #696 +; CHECK-NEXT: ld1 { v6.b }[4], [x9] +; CHECK-NEXT: ld1 { v3.b }[6], [x15] +; CHECK-NEXT: add x9, sp, #640 ; CHECK-NEXT: ld1 { v0.b }[15], [x10] ; CHECK-NEXT: add x10, sp, #496 -; CHECK-NEXT: add x9, sp, #640 ; CHECK-NEXT: ld1 { v5.b }[12], [x8] ; CHECK-NEXT: ld1 { v7.b }[4], [x10] ; CHECK-NEXT: ld1 { v4.b }[13], [x13] @@ -1105,220 +1105,220 @@ define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: fmov s3, w0 +; CHECK-NEXT: fmov s4, w0 ; CHECK-NEXT: ldr b0, [sp, #80] ; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b1, [sp, #16] ; CHECK-NEXT: add x10, sp, #24 ; CHECK-NEXT: ldr b2, [sp, #280] ; CHECK-NEXT: ld1 { v0.b }[1], [x8] -; CHECK-NEXT: ldr b5, [sp, #152] -; CHECK-NEXT: add x9, sp, #96 -; CHECK-NEXT: mov v3.b[1], w1 +; CHECK-NEXT: ldr b3, [sp, #216] +; CHECK-NEXT: add x11, sp, #224 +; CHECK-NEXT: mov v4.b[1], w1 ; CHECK-NEXT: ld1 { v1.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #288 +; CHECK-NEXT: ldr b5, [sp, #152] +; CHECK-NEXT: add x9, sp, #96 ; CHECK-NEXT: ld1 { v2.b }[1], [x10] +; CHECK-NEXT: ld1 { v3.b }[1], [x11] ; CHECK-NEXT: add x10, sp, #160 -; CHECK-NEXT: ldr b4, [sp, #216] ; CHECK-NEXT: ld1 { v0.b }[2], [x9] ; CHECK-NEXT: ld1 { v5.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #32 -; CHECK-NEXT: add x11, sp, #224 -; CHECK-NEXT: ld1 { v1.b }[2], [x10] -; CHECK-NEXT: add x8, sp, #104 -; CHECK-NEXT: mov v3.b[2], w2 -; CHECK-NEXT: ld1 { v4.b }[1], [x11] ; CHECK-NEXT: add x11, sp, #296 -; CHECK-NEXT: ld1 { v0.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #40 +; CHECK-NEXT: mov v4.b[2], w2 +; CHECK-NEXT: ld1 { v1.b }[2], [x10] ; CHECK-NEXT: add x10, sp, #232 -; CHECK-NEXT: ld1 { v1.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #104 ; CHECK-NEXT: ld1 { v2.b }[2], [x11] +; CHECK-NEXT: ld1 { v3.b }[2], [x10] ; CHECK-NEXT: add x11, sp, #168 -; CHECK-NEXT: ld1 { v4.b }[2], [x10] +; CHECK-NEXT: ld1 { v0.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #40 ; CHECK-NEXT: ld1 { v5.b }[2], [x11] -; CHECK-NEXT: add x13, sp, #48 -; CHECK-NEXT: mov v3.b[3], w3 +; CHECK-NEXT: ld1 { v1.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #240 -; CHECK-NEXT: add x15, sp, #56 -; CHECK-NEXT: ld1 { v1.b }[4], [x13] -; CHECK-NEXT: add x12, sp, #112 -; CHECK-NEXT: add x11, sp, #304 -; CHECK-NEXT: ld1 { v4.b }[3], [x8] +; CHECK-NEXT: mov v4.b[3], w3 +; CHECK-NEXT: ld1 { v3.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #176 -; CHECK-NEXT: ld1 { v0.b }[4], [x12] +; CHECK-NEXT: add x12, sp, #112 +; CHECK-NEXT: add x13, sp, #48 +; CHECK-NEXT: add x9, sp, #120 ; CHECK-NEXT: ld1 { v5.b }[3], [x8] +; CHECK-NEXT: ld1 { v0.b }[4], [x12] ; CHECK-NEXT: add x12, sp, #184 -; CHECK-NEXT: ld1 { v2.b }[3], [x11] -; CHECK-NEXT: mov v3.b[4], w4 -; CHECK-NEXT: ld1 { v1.b }[5], [x15] -; CHECK-NEXT: add x11, sp, #64 -; CHECK-NEXT: add x9, sp, #120 -; CHECK-NEXT: add x8, sp, #312 -; CHECK-NEXT: ldr b6, [sp, #352] +; CHECK-NEXT: ld1 { v1.b }[4], [x13] +; CHECK-NEXT: add x15, sp, #56 +; CHECK-NEXT: add x14, sp, #128 +; CHECK-NEXT: mov v4.b[4], w4 +; CHECK-NEXT: add x11, sp, #304 +; CHECK-NEXT: add x13, sp, #256 ; CHECK-NEXT: ld1 { v5.b }[4], [x12] ; CHECK-NEXT: ld1 { v0.b }[5], [x9] ; CHECK-NEXT: add x9, sp, #192 -; CHECK-NEXT: ld1 { v1.b }[6], [x11] -; CHECK-NEXT: ld1 { v2.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #72 -; CHECK-NEXT: mov v3.b[5], w5 -; CHECK-NEXT: add x11, sp, #360 -; CHECK-NEXT: ldr b16, [sp, #552] -; CHECK-NEXT: ld1 { v5.b }[5], [x9] -; CHECK-NEXT: ld1 { v6.b }[1], [x11] +; CHECK-NEXT: add x12, sp, #248 +; CHECK-NEXT: ld1 { v1.b }[5], [x15] ; CHECK-NEXT: add x15, sp, #200 -; CHECK-NEXT: ld1 { v1.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #560 -; CHECK-NEXT: add x14, sp, #128 -; CHECK-NEXT: ld1 { v16.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #368 +; CHECK-NEXT: ld1 { v3.b }[4], [x12] +; CHECK-NEXT: ld1 { v2.b }[3], [x11] +; CHECK-NEXT: add x11, sp, #64 +; CHECK-NEXT: mov v4.b[5], w5 +; CHECK-NEXT: ld1 { v5.b }[5], [x9] ; CHECK-NEXT: ld1 { v0.b }[6], [x14] -; CHECK-NEXT: mov v3.b[6], w6 +; CHECK-NEXT: ldr b6, [sp, #352] +; CHECK-NEXT: add x10, sp, #136 +; CHECK-NEXT: ld1 { v1.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #360 +; CHECK-NEXT: ld1 { v3.b }[5], [x13] +; CHECK-NEXT: ldr b18, [sp, #552] ; CHECK-NEXT: ld1 { v5.b }[6], [x15] -; CHECK-NEXT: ld1 { v6.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #568 ; CHECK-NEXT: add x14, sp, #208 -; CHECK-NEXT: ldr b18, [sp, #480] -; CHECK-NEXT: ld1 { v16.b }[2], [x8] -; CHECK-NEXT: ldr b7, [sp, #144] -; CHECK-NEXT: add x11, sp, #488 +; CHECK-NEXT: ld1 { v6.b }[1], [x11] +; CHECK-NEXT: mov v4.b[6], w6 +; CHECK-NEXT: ld1 { v0.b }[7], [x10] +; CHECK-NEXT: add x10, sp, #560 +; CHECK-NEXT: add x9, sp, #264 +; CHECK-NEXT: ld1 { v18.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #568 ; CHECK-NEXT: ld1 { v5.b }[7], [x14] -; CHECK-NEXT: add x8, sp, #376 -; CHECK-NEXT: ld1 { v18.b }[1], [x11] -; CHECK-NEXT: mov v3.b[7], w7 -; CHECK-NEXT: ld1 { v6.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #576 +; CHECK-NEXT: ld1 { v3.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #368 +; CHECK-NEXT: ld1 { v6.b }[2], [x9] +; CHECK-NEXT: add x11, sp, #488 +; CHECK-NEXT: ldr b7, [sp, #144] +; CHECK-NEXT: mov v4.b[7], w7 +; CHECK-NEXT: ld1 { v18.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #376 +; CHECK-NEXT: sshll v17.8h, v5.8b, #0 +; CHECK-NEXT: ldr b5, [sp, #480] ; CHECK-NEXT: sshll v7.8h, v7.8b, #0 -; CHECK-NEXT: ld1 { v16.b }[3], [x8] +; CHECK-NEXT: ld1 { v6.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #576 +; CHECK-NEXT: add x8, sp, #312 +; CHECK-NEXT: ld1 { v5.b }[1], [x11] +; CHECK-NEXT: ld1 { v18.b }[3], [x10] ; CHECK-NEXT: add x11, sp, #496 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 -; CHECK-NEXT: add x8, sp, #384 -; CHECK-NEXT: ld1 { v18.b }[2], [x11] -; CHECK-NEXT: ld1 { v6.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #584 +; CHECK-NEXT: sshll v16.8h, v4.8b, #0 +; CHECK-NEXT: ldr b4, [sp, #344] +; CHECK-NEXT: add x10, sp, #384 +; CHECK-NEXT: ld1 { v6.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #584 +; CHECK-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-NEXT: sshll v19.8h, v4.8b, #0 +; CHECK-NEXT: ld1 { v5.b }[2], [x11] +; CHECK-NEXT: ld1 { v18.b }[4], [x10] +; CHECK-NEXT: smull2 v4.4s, v16.8h, v17.8h +; CHECK-NEXT: smull v16.4s, v16.4h, v17.4h +; CHECK-NEXT: ldr b17, [sp, #416] ; CHECK-NEXT: add x11, sp, #504 -; CHECK-NEXT: sshll v17.8h, v3.8b, #0 -; CHECK-NEXT: ldr b3, [sp, #344] -; CHECK-NEXT: ld1 { v16.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #424 +; CHECK-NEXT: add x10, sp, #424 ; CHECK-NEXT: add x16, sp, #320 -; CHECK-NEXT: ld1 { v18.b }[3], [x11] -; CHECK-NEXT: sshll v19.8h, v3.8b, #0 +; CHECK-NEXT: smull v19.4s, v7.4h, v19.4h +; CHECK-NEXT: movi v7.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v5.b }[3], [x11] ; CHECK-NEXT: add x11, sp, #392 +; CHECK-NEXT: ld1 { v17.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #592 ; CHECK-NEXT: ld1 { v2.b }[5], [x16] -; CHECK-NEXT: smull2 v3.4s, v17.8h, v5.8h -; CHECK-NEXT: smull v5.4s, v17.4h, v5.4h -; CHECK-NEXT: movi v17.2d, #0000000000000000 ; CHECK-NEXT: ld1 { v6.b }[5], [x11] -; CHECK-NEXT: add x12, sp, #248 +; CHECK-NEXT: ld1 { v18.b }[5], [x10] ; CHECK-NEXT: add x11, sp, #512 -; CHECK-NEXT: smull v7.4s, v7.4h, v19.4h -; CHECK-NEXT: ldr b19, [sp, #416] -; CHECK-NEXT: ld1 { v4.b }[4], [x12] +; CHECK-NEXT: add x10, sp, #432 ; CHECK-NEXT: add x12, sp, #328 -; CHECK-NEXT: ld1 { v18.b }[4], [x11] +; CHECK-NEXT: mov v7.s[0], v19.s[0] +; CHECK-NEXT: ld1 { v5.b }[4], [x11] ; CHECK-NEXT: add x11, sp, #400 -; CHECK-NEXT: ld1 { v19.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #592 +; CHECK-NEXT: ld1 { v17.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #600 +; CHECK-NEXT: ldr b19, [sp, #680] ; CHECK-NEXT: ldr b20, [sp, #616] -; CHECK-NEXT: ld1 { v16.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #432 ; CHECK-NEXT: ld1 { v2.b }[6], [x12] -; CHECK-NEXT: mov v17.s[0], v7.s[0] -; CHECK-NEXT: ldr b7, [sp, #680] ; CHECK-NEXT: ld1 { v6.b }[6], [x11] -; CHECK-NEXT: ld1 { v19.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #600 +; CHECK-NEXT: ld1 { v18.b }[6], [x10] ; CHECK-NEXT: add x11, sp, #688 -; CHECK-NEXT: ld1 { v16.b }[6], [x8] ; CHECK-NEXT: add x12, sp, #624 -; CHECK-NEXT: ld1 { v7.b }[1], [x11] +; CHECK-NEXT: ld1 { v19.b }[1], [x11] ; CHECK-NEXT: ld1 { v20.b }[1], [x12] -; CHECK-NEXT: add x8, sp, #408 +; CHECK-NEXT: add x10, sp, #408 ; CHECK-NEXT: add x11, sp, #608 ; CHECK-NEXT: add x12, sp, #440 -; CHECK-NEXT: ld1 { v6.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #696 -; CHECK-NEXT: ld1 { v16.b }[7], [x11] -; CHECK-NEXT: ld1 { v19.b }[3], [x12] +; CHECK-NEXT: ld1 { v6.b }[7], [x10] +; CHECK-NEXT: ld1 { v18.b }[7], [x11] +; CHECK-NEXT: ld1 { v17.b }[3], [x12] +; CHECK-NEXT: add x10, sp, #696 ; CHECK-NEXT: add x11, sp, #632 -; CHECK-NEXT: ld1 { v7.b }[2], [x8] +; CHECK-NEXT: ld1 { v19.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #448 ; CHECK-NEXT: ld1 { v20.b }[2], [x11] -; CHECK-NEXT: add x8, sp, #448 ; CHECK-NEXT: add x11, sp, #640 ; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: add x13, sp, #256 -; CHECK-NEXT: ld1 { v19.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #704 -; CHECK-NEXT: sshll v16.8h, v16.8b, #0 -; CHECK-NEXT: ld1 { v7.b }[3], [x8] +; CHECK-NEXT: ld1 { v17.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #704 +; CHECK-NEXT: sshll v18.8h, v18.8b, #0 +; CHECK-NEXT: ld1 { v19.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #712 +; CHECK-NEXT: add x12, sp, #520 ; CHECK-NEXT: ld1 { v20.b }[3], [x11] -; CHECK-NEXT: add x8, sp, #712 ; CHECK-NEXT: add x11, sp, #648 -; CHECK-NEXT: add x12, sp, #520 -; CHECK-NEXT: ld1 { v4.b }[5], [x13] ; CHECK-NEXT: ldr b21, [sp, #544] -; CHECK-NEXT: smull2 v22.4s, v6.8h, v16.8h -; CHECK-NEXT: smull v6.4s, v6.4h, v16.4h -; CHECK-NEXT: ld1 { v7.b }[4], [x8] +; CHECK-NEXT: smull2 v22.4s, v6.8h, v18.8h +; CHECK-NEXT: smull v6.4s, v6.4h, v18.4h +; CHECK-NEXT: ldr b18, [sp, #744] +; CHECK-NEXT: ld1 { v19.b }[4], [x10] +; CHECK-NEXT: ld1 { v5.b }[5], [x12] +; CHECK-NEXT: add x12, sp, #656 ; CHECK-NEXT: ld1 { v20.b }[4], [x11] ; CHECK-NEXT: add x11, sp, #456 -; CHECK-NEXT: ldr b16, [sp, #744] -; CHECK-NEXT: ld1 { v18.b }[5], [x12] -; CHECK-NEXT: ld1 { v19.b }[5], [x11] -; CHECK-NEXT: add x11, sp, #720 -; CHECK-NEXT: add x12, sp, #656 -; CHECK-NEXT: add x9, sp, #264 -; CHECK-NEXT: ld1 { v7.b }[5], [x11] -; CHECK-NEXT: ld1 { v20.b }[5], [x12] ; CHECK-NEXT: sshll v21.8h, v21.8b, #0 -; CHECK-NEXT: sshll v16.8h, v16.8b, #0 -; CHECK-NEXT: add x8, sp, #528 -; CHECK-NEXT: ld1 { v4.b }[6], [x9] -; CHECK-NEXT: ld1 { v18.b }[6], [x8] +; CHECK-NEXT: ld1 { v17.b }[5], [x11] +; CHECK-NEXT: add x11, sp, #720 +; CHECK-NEXT: sshll v18.8h, v18.8b, #0 +; CHECK-NEXT: ld1 { v19.b }[5], [x11] +; CHECK-NEXT: add x10, sp, #528 ; CHECK-NEXT: add x11, sp, #464 +; CHECK-NEXT: ld1 { v20.b }[5], [x12] +; CHECK-NEXT: ld1 { v5.b }[6], [x10] ; CHECK-NEXT: add x12, sp, #728 ; CHECK-NEXT: add x13, sp, #664 -; CHECK-NEXT: add x10, sp, #136 -; CHECK-NEXT: ld1 { v19.b }[6], [x11] -; CHECK-NEXT: ld1 { v7.b }[6], [x12] +; CHECK-NEXT: add x8, sp, #72 +; CHECK-NEXT: ld1 { v17.b }[6], [x11] +; CHECK-NEXT: ld1 { v19.b }[6], [x12] +; CHECK-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-NEXT: add x8, sp, #336 ; CHECK-NEXT: ld1 { v20.b }[6], [x13] -; CHECK-NEXT: ld1 { v0.b }[7], [x10] -; CHECK-NEXT: add x10, sp, #336 ; CHECK-NEXT: add x9, sp, #272 -; CHECK-NEXT: smull v16.4s, v21.4h, v16.4h +; CHECK-NEXT: smull v18.4s, v21.4h, v18.4h ; CHECK-NEXT: movi v21.2d, #0000000000000000 -; CHECK-NEXT: add x8, sp, #536 -; CHECK-NEXT: ld1 { v2.b }[7], [x10] -; CHECK-NEXT: ld1 { v4.b }[7], [x9] -; CHECK-NEXT: ld1 { v18.b }[7], [x8] +; CHECK-NEXT: add x10, sp, #536 +; CHECK-NEXT: ld1 { v2.b }[7], [x8] +; CHECK-NEXT: ld1 { v3.b }[7], [x9] +; CHECK-NEXT: ld1 { v5.b }[7], [x10] ; CHECK-NEXT: add x8, sp, #472 ; CHECK-NEXT: add x9, sp, #736 ; CHECK-NEXT: add x10, sp, #672 -; CHECK-NEXT: ld1 { v19.b }[7], [x8] -; CHECK-NEXT: ld1 { v7.b }[7], [x9] +; CHECK-NEXT: ld1 { v17.b }[7], [x8] +; CHECK-NEXT: ld1 { v19.b }[7], [x9] ; CHECK-NEXT: ld1 { v20.b }[7], [x10] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: mov v21.s[0], v16.s[0] +; CHECK-NEXT: mov v21.s[0], v18.s[0] ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: sshll v4.8h, v4.8b, #0 -; CHECK-NEXT: sshll v16.8h, v18.8b, #0 +; CHECK-NEXT: sshll v3.8h, v3.8b, #0 +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: sshll v17.8h, v17.8b, #0 ; CHECK-NEXT: sshll v18.8h, v19.8b, #0 -; CHECK-NEXT: sshll v7.8h, v7.8b, #0 ; CHECK-NEXT: sshll v19.8h, v20.8b, #0 -; CHECK-NEXT: smlal v5.4s, v0.4h, v2.4h -; CHECK-NEXT: smlal2 v3.4s, v0.8h, v2.8h -; CHECK-NEXT: smlal v17.4s, v1.4h, v4.4h -; CHECK-NEXT: smlal v6.4s, v16.4h, v7.4h -; CHECK-NEXT: smlal2 v22.4s, v16.8h, v7.8h -; CHECK-NEXT: smlal v21.4s, v18.4h, v19.4h -; CHECK-NEXT: smlal2 v3.4s, v1.8h, v4.8h -; CHECK-NEXT: add v0.4s, v5.4s, v17.4s +; CHECK-NEXT: smlal v16.4s, v0.4h, v2.4h +; CHECK-NEXT: smlal2 v4.4s, v0.8h, v2.8h +; CHECK-NEXT: smlal v7.4s, v1.4h, v3.4h +; CHECK-NEXT: smlal v6.4s, v5.4h, v18.4h +; CHECK-NEXT: smlal2 v22.4s, v5.8h, v18.8h +; CHECK-NEXT: smlal v21.4s, v17.4h, v19.4h +; CHECK-NEXT: smlal2 v4.4s, v1.8h, v3.8h +; CHECK-NEXT: add v0.4s, v16.4s, v7.4s ; CHECK-NEXT: add v1.4s, v6.4s, v21.4s -; CHECK-NEXT: smlal2 v22.4s, v18.8h, v19.8h -; CHECK-NEXT: add v0.4s, v0.4s, v3.4s +; CHECK-NEXT: smlal2 v22.4s, v17.8h, v19.8h +; CHECK-NEXT: add v0.4s, v0.4s, v4.4s ; CHECK-NEXT: add v1.4s, v1.4s, v22.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s @@ -1860,10 +1860,10 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 ; CHECK-NEXT: sshll v23.8h, v16.8b, #0 ; CHECK-NEXT: ld1 { v7.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #520 -; CHECK-NEXT: ldr b24, [sp, #872] +; CHECK-NEXT: movi v19.2d, #0000000000000000 ; CHECK-NEXT: ld1 { v22.b }[7], [x9] ; CHECK-NEXT: add x9, sp, #528 -; CHECK-NEXT: movi v19.2d, #0000000000000000 +; CHECK-NEXT: add x10, sp, #464 ; CHECK-NEXT: ld1 { v4.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #568 ; CHECK-NEXT: smull2 v18.4s, v20.8h, v23.8h @@ -1878,13 +1878,13 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 ; CHECK-NEXT: ldr b23, [sp, #1000] ; CHECK-NEXT: ld1 { v7.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #688 -; CHECK-NEXT: sshll v22.8h, v22.8b, #0 +; CHECK-NEXT: sshll v24.8h, v22.8b, #0 ; CHECK-NEXT: ld1 { v21.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #696 -; CHECK-NEXT: sshll v23.8h, v23.8b, #0 +; CHECK-NEXT: sshll v25.8h, v23.8b, #0 ; CHECK-NEXT: add x8, sp, #536 -; CHECK-NEXT: ldr b25, [sp, #936] -; CHECK-NEXT: add x10, sp, #464 +; CHECK-NEXT: ldr b22, [sp, #872] +; CHECK-NEXT: ldr b23, [sp, #936] ; CHECK-NEXT: ld1 { v4.b }[7], [x8] ; CHECK-NEXT: add x8, sp, #584 ; CHECK-NEXT: ld1 { v17.b }[7], [x10] @@ -1892,110 +1892,110 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 ; CHECK-NEXT: ld1 { v7.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #880 ; CHECK-NEXT: add x9, sp, #704 -; CHECK-NEXT: smull v22.4s, v22.4h, v23.4h -; CHECK-NEXT: ldr b23, [sp, #744] -; CHECK-NEXT: ld1 { v24.b }[1], [x8] +; CHECK-NEXT: smull v25.4s, v24.4h, v25.4h +; CHECK-NEXT: ldr b24, [sp, #744] +; CHECK-NEXT: ld1 { v22.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #944 ; CHECK-NEXT: add x10, sp, #888 ; CHECK-NEXT: ld1 { v21.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #752 -; CHECK-NEXT: ld1 { v25.b }[1], [x8] -; CHECK-NEXT: ld1 { v23.b }[1], [x9] +; CHECK-NEXT: ld1 { v23.b }[1], [x8] +; CHECK-NEXT: ld1 { v24.b }[1], [x9] ; CHECK-NEXT: add x8, sp, #712 ; CHECK-NEXT: add x9, sp, #760 -; CHECK-NEXT: ld1 { v24.b }[2], [x10] +; CHECK-NEXT: ld1 { v22.b }[2], [x10] ; CHECK-NEXT: add x10, sp, #952 -; CHECK-NEXT: mov v19.s[0], v22.s[0] -; CHECK-NEXT: ldr b22, [sp, #808] -; CHECK-NEXT: ld1 { v25.b }[2], [x10] +; CHECK-NEXT: mov v19.s[0], v25.s[0] +; CHECK-NEXT: ldr b25, [sp, #808] +; CHECK-NEXT: ld1 { v23.b }[2], [x10] ; CHECK-NEXT: ld1 { v21.b }[5], [x8] -; CHECK-NEXT: ld1 { v23.b }[2], [x9] +; CHECK-NEXT: ld1 { v24.b }[2], [x9] ; CHECK-NEXT: add x8, sp, #816 ; CHECK-NEXT: add x9, sp, #896 -; CHECK-NEXT: ld1 { v22.b }[1], [x8] +; CHECK-NEXT: ld1 { v25.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #960 -; CHECK-NEXT: ld1 { v24.b }[3], [x9] +; CHECK-NEXT: ld1 { v22.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #768 -; CHECK-NEXT: ld1 { v25.b }[3], [x8] +; CHECK-NEXT: ld1 { v23.b }[3], [x8] ; CHECK-NEXT: add x10, sp, #904 -; CHECK-NEXT: ld1 { v23.b }[3], [x9] +; CHECK-NEXT: ld1 { v24.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #824 ; CHECK-NEXT: add x8, sp, #720 -; CHECK-NEXT: ld1 { v22.b }[2], [x9] +; CHECK-NEXT: ld1 { v25.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #968 -; CHECK-NEXT: ld1 { v24.b }[4], [x10] +; CHECK-NEXT: ld1 { v22.b }[4], [x10] ; CHECK-NEXT: add x10, sp, #776 -; CHECK-NEXT: ld1 { v25.b }[4], [x9] +; CHECK-NEXT: ld1 { v23.b }[4], [x9] ; CHECK-NEXT: ld1 { v21.b }[6], [x8] -; CHECK-NEXT: ld1 { v23.b }[4], [x10] +; CHECK-NEXT: ld1 { v24.b }[4], [x10] ; CHECK-NEXT: add x8, sp, #832 ; CHECK-NEXT: add x9, sp, #912 -; CHECK-NEXT: ld1 { v22.b }[3], [x8] +; CHECK-NEXT: ld1 { v25.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #976 -; CHECK-NEXT: ld1 { v24.b }[5], [x9] +; CHECK-NEXT: ld1 { v22.b }[5], [x9] ; CHECK-NEXT: add x9, sp, #784 -; CHECK-NEXT: ld1 { v25.b }[5], [x8] +; CHECK-NEXT: ld1 { v23.b }[5], [x8] ; CHECK-NEXT: add x10, sp, #920 -; CHECK-NEXT: ld1 { v23.b }[5], [x9] +; CHECK-NEXT: ld1 { v24.b }[5], [x9] ; CHECK-NEXT: add x9, sp, #840 ; CHECK-NEXT: add x8, sp, #728 -; CHECK-NEXT: ld1 { v22.b }[4], [x9] +; CHECK-NEXT: ld1 { v25.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #984 -; CHECK-NEXT: ld1 { v24.b }[6], [x10] +; CHECK-NEXT: ld1 { v22.b }[6], [x10] ; CHECK-NEXT: add x10, sp, #792 -; CHECK-NEXT: ld1 { v25.b }[6], [x9] +; CHECK-NEXT: ld1 { v23.b }[6], [x9] ; CHECK-NEXT: ld1 { v21.b }[7], [x8] -; CHECK-NEXT: ld1 { v23.b }[6], [x10] +; CHECK-NEXT: ld1 { v24.b }[6], [x10] ; CHECK-NEXT: add x8, sp, #848 ; CHECK-NEXT: add x9, sp, #928 -; CHECK-NEXT: ld1 { v22.b }[5], [x8] +; CHECK-NEXT: ld1 { v25.b }[5], [x8] ; CHECK-NEXT: add x12, sp, #72 ; CHECK-NEXT: add x8, sp, #992 -; CHECK-NEXT: ld1 { v24.b }[7], [x9] +; CHECK-NEXT: ld1 { v22.b }[7], [x9] ; CHECK-NEXT: add x9, sp, #800 ; CHECK-NEXT: ld1 { v3.b }[7], [x12] -; CHECK-NEXT: ld1 { v25.b }[7], [x8] +; CHECK-NEXT: ld1 { v23.b }[7], [x8] ; CHECK-NEXT: add x8, sp, #592 -; CHECK-NEXT: ld1 { v23.b }[7], [x9] +; CHECK-NEXT: ld1 { v24.b }[7], [x9] ; CHECK-NEXT: add x9, sp, #856 ; CHECK-NEXT: ld1 { v7.b }[6], [x8] ; CHECK-NEXT: add x11, sp, #200 -; CHECK-NEXT: ld1 { v22.b }[6], [x9] +; CHECK-NEXT: ld1 { v25.b }[6], [x9] ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 ; CHECK-NEXT: sshll v5.8h, v5.8b, #0 ; CHECK-NEXT: sshll v4.8h, v4.8b, #0 ; CHECK-NEXT: sshll v21.8h, v21.8b, #0 -; CHECK-NEXT: sshll v24.8h, v24.8b, #0 -; CHECK-NEXT: sshll v25.8h, v25.8b, #0 -; CHECK-NEXT: add x8, sp, #600 +; CHECK-NEXT: sshll v22.8h, v22.8b, #0 ; CHECK-NEXT: sshll v23.8h, v23.8b, #0 +; CHECK-NEXT: add x8, sp, #600 +; CHECK-NEXT: sshll v24.8h, v24.8b, #0 ; CHECK-NEXT: add x9, sp, #864 ; CHECK-NEXT: ld1 { v2.b }[7], [x11] ; CHECK-NEXT: ld1 { v7.b }[7], [x8] -; CHECK-NEXT: ld1 { v22.b }[7], [x9] +; CHECK-NEXT: ld1 { v25.b }[7], [x9] ; CHECK-NEXT: smull v16.4s, v3.4h, v5.4h ; CHECK-NEXT: smull2 v3.4s, v3.8h, v5.8h -; CHECK-NEXT: smull v5.4s, v21.4h, v25.4h -; CHECK-NEXT: smull2 v21.4s, v21.8h, v25.8h -; CHECK-NEXT: smull2 v25.4s, v20.8h, v24.8h -; CHECK-NEXT: smlal v19.4s, v4.4h, v23.4h +; CHECK-NEXT: smull v5.4s, v21.4h, v23.4h +; CHECK-NEXT: smull2 v21.4s, v21.8h, v23.8h +; CHECK-NEXT: smull2 v23.4s, v20.8h, v22.8h +; CHECK-NEXT: smlal v19.4s, v4.4h, v24.4h ; CHECK-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NEXT: sshll v17.8h, v17.8b, #0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: sshll v7.8h, v7.8b, #0 -; CHECK-NEXT: sshll v22.8h, v22.8b, #0 +; CHECK-NEXT: sshll v25.8h, v25.8b, #0 ; CHECK-NEXT: smlal2 v3.4s, v2.8h, v17.8h ; CHECK-NEXT: smlal v16.4s, v2.4h, v17.4h -; CHECK-NEXT: smlal2 v25.4s, v4.8h, v23.8h +; CHECK-NEXT: smlal2 v23.4s, v4.8h, v24.8h ; CHECK-NEXT: smlal2 v18.4s, v0.8h, v1.8h ; CHECK-NEXT: smlal v6.4s, v0.4h, v1.4h -; CHECK-NEXT: smlal v19.4s, v20.4h, v24.4h -; CHECK-NEXT: smlal2 v21.4s, v7.8h, v22.8h -; CHECK-NEXT: smlal v5.4s, v7.4h, v22.4h +; CHECK-NEXT: smlal v19.4s, v20.4h, v22.4h +; CHECK-NEXT: smlal2 v21.4s, v7.8h, v25.8h +; CHECK-NEXT: smlal v5.4s, v7.4h, v25.4h ; CHECK-NEXT: add v0.4s, v18.4s, v3.4s ; CHECK-NEXT: add v1.4s, v6.4s, v16.4s -; CHECK-NEXT: add v2.4s, v25.4s, v21.4s +; CHECK-NEXT: add v2.4s, v23.4s, v21.4s ; CHECK-NEXT: add v3.4s, v19.4s, v5.4s ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: add v1.4s, v3.4s, v2.4s @@ -2267,14 +2267,14 @@ define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ldr b3, [sp, #592] ; CHECK-NEXT: add x8, sp, #600 -; CHECK-NEXT: ldr b4, [sp, #208] +; CHECK-NEXT: ldr b6, [sp, #208] ; CHECK-NEXT: ldr b0, [sp, #336] ; CHECK-NEXT: add x9, sp, #344 ; CHECK-NEXT: ldr b2, [sp, #464] ; CHECK-NEXT: ld1 { v3.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #216 ; CHECK-NEXT: add x10, sp, #624 -; CHECK-NEXT: ld1 { v4.b }[1], [x8] +; CHECK-NEXT: ld1 { v6.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #608 ; CHECK-NEXT: ld1 { v0.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #232 @@ -2282,17 +2282,17 @@ define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 ; CHECK-NEXT: ldr b7, [sp, #1360] ; CHECK-NEXT: ld1 { v3.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #224 -; CHECK-NEXT: add x12, sp, #376 -; CHECK-NEXT: ld1 { v4.b }[2], [x8] +; CHECK-NEXT: add x11, sp, #648 +; CHECK-NEXT: ld1 { v6.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #616 -; CHECK-NEXT: add x11, sp, #656 +; CHECK-NEXT: add x12, sp, #376 ; CHECK-NEXT: mov v1.b[1], w1 -; CHECK-NEXT: ldr b17, [sp, #976] +; CHECK-NEXT: ldr b16, [sp, #976] ; CHECK-NEXT: add x14, sp, #288 ; CHECK-NEXT: ld1 { v3.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #632 ; CHECK-NEXT: add x15, sp, #408 -; CHECK-NEXT: ld1 { v4.b }[3], [x9] +; CHECK-NEXT: ld1 { v6.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #472 ; CHECK-NEXT: add x13, sp, #696 ; CHECK-NEXT: ld1 { v2.b }[1], [x9] @@ -2301,344 +2301,344 @@ define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 ; CHECK-NEXT: ld1 { v3.b }[4], [x10] ; CHECK-NEXT: add x10, sp, #352 ; CHECK-NEXT: mov v1.b[2], w2 -; CHECK-NEXT: ld1 { v4.b }[4], [x9] +; CHECK-NEXT: ld1 { v6.b }[4], [x9] ; CHECK-NEXT: ld1 { v0.b }[2], [x10] ; CHECK-NEXT: add x10, sp, #1368 ; CHECK-NEXT: ld1 { v7.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #248 ; CHECK-NEXT: add x9, sp, #640 ; CHECK-NEXT: ld1 { v3.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #648 -; CHECK-NEXT: movi v6.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v4.b }[5], [x10] +; CHECK-NEXT: add x8, sp, #656 +; CHECK-NEXT: movi v5.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v6.b }[5], [x10] ; CHECK-NEXT: add x10, sp, #360 ; CHECK-NEXT: mov v1.b[3], w3 ; CHECK-NEXT: ld1 { v0.b }[3], [x10] ; CHECK-NEXT: add x10, sp, #256 -; CHECK-NEXT: movi v5.2d, #0000000000000000 +; CHECK-NEXT: movi v4.2d, #0000000000000000 ; CHECK-NEXT: ld1 { v3.b }[6], [x9] ; CHECK-NEXT: add x9, sp, #368 -; CHECK-NEXT: ldr b16, [sp, #720] -; CHECK-NEXT: ld1 { v4.b }[6], [x10] +; CHECK-NEXT: ldr b17, [sp, #720] +; CHECK-NEXT: ld1 { v6.b }[6], [x10] ; CHECK-NEXT: add x10, sp, #984 ; CHECK-NEXT: ld1 { v0.b }[4], [x9] -; CHECK-NEXT: ld1 { v17.b }[1], [x10] +; CHECK-NEXT: ld1 { v16.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #664 -; CHECK-NEXT: ld1 { v3.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #264 +; CHECK-NEXT: ld1 { v3.b }[7], [x11] +; CHECK-NEXT: add x11, sp, #264 ; CHECK-NEXT: mov v1.b[4], w4 -; CHECK-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-NEXT: ld1 { v6.b }[7], [x11] ; CHECK-NEXT: add x9, sp, #672 -; CHECK-NEXT: add x8, sp, #680 +; CHECK-NEXT: add x11, sp, #680 ; CHECK-NEXT: ld1 { v0.b }[5], [x12] ; CHECK-NEXT: add x12, sp, #480 ; CHECK-NEXT: ld1 { v2.b }[2], [x12] ; CHECK-NEXT: add x12, sp, #272 -; CHECK-NEXT: ld1 { v3.b }[8], [x11] -; CHECK-NEXT: ld1 { v4.b }[8], [x12] +; CHECK-NEXT: ld1 { v3.b }[8], [x8] +; CHECK-NEXT: ld1 { v6.b }[8], [x12] ; CHECK-NEXT: add x12, sp, #384 ; CHECK-NEXT: mov v1.b[5], w5 ; CHECK-NEXT: ld1 { v0.b }[6], [x12] ; CHECK-NEXT: add x12, sp, #280 -; CHECK-NEXT: add x11, sp, #688 +; CHECK-NEXT: add x8, sp, #688 ; CHECK-NEXT: ld1 { v3.b }[9], [x10] ; CHECK-NEXT: add x10, sp, #1376 ; CHECK-NEXT: ld1 { v7.b }[2], [x10] ; CHECK-NEXT: add x10, sp, #392 -; CHECK-NEXT: ld1 { v4.b }[9], [x12] +; CHECK-NEXT: ld1 { v6.b }[9], [x12] ; CHECK-NEXT: ld1 { v0.b }[7], [x10] ; CHECK-NEXT: mov v1.b[6], w6 ; CHECK-NEXT: add x12, sp, #704 ; CHECK-NEXT: ld1 { v3.b }[10], [x9] ; CHECK-NEXT: add x9, sp, #400 ; CHECK-NEXT: add x10, sp, #712 -; CHECK-NEXT: ld1 { v4.b }[10], [x14] +; CHECK-NEXT: ld1 { v6.b }[10], [x14] ; CHECK-NEXT: add x14, sp, #992 ; CHECK-NEXT: ld1 { v0.b }[8], [x9] -; CHECK-NEXT: ld1 { v17.b }[2], [x14] +; CHECK-NEXT: ld1 { v16.b }[2], [x14] ; CHECK-NEXT: add x14, sp, #296 -; CHECK-NEXT: ld1 { v3.b }[11], [x8] +; CHECK-NEXT: ld1 { v3.b }[11], [x11] ; CHECK-NEXT: add x9, sp, #304 -; CHECK-NEXT: add x8, sp, #312 -; CHECK-NEXT: ld1 { v4.b }[11], [x14] +; CHECK-NEXT: add x11, sp, #312 +; CHECK-NEXT: ld1 { v6.b }[11], [x14] ; CHECK-NEXT: mov v1.b[7], w7 ; CHECK-NEXT: add x14, sp, #320 ; CHECK-NEXT: ld1 { v0.b }[9], [x15] ; CHECK-NEXT: add x15, sp, #328 -; CHECK-NEXT: ld1 { v3.b }[12], [x11] -; CHECK-NEXT: add x11, sp, #416 -; CHECK-NEXT: ld1 { v4.b }[12], [x9] +; CHECK-NEXT: ld1 { v3.b }[12], [x8] +; CHECK-NEXT: add x8, sp, #416 +; CHECK-NEXT: ld1 { v6.b }[12], [x9] ; CHECK-NEXT: add x9, sp, #1384 -; CHECK-NEXT: ld1 { v0.b }[10], [x11] +; CHECK-NEXT: ld1 { v0.b }[10], [x8] ; CHECK-NEXT: ld1 { v7.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #424 ; CHECK-NEXT: ld1 { v3.b }[13], [x13] -; CHECK-NEXT: add x11, sp, #432 +; CHECK-NEXT: add x8, sp, #432 ; CHECK-NEXT: add x13, sp, #440 -; CHECK-NEXT: ld1 { v4.b }[13], [x8] -; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ld1 { v6.b }[13], [x11] +; CHECK-NEXT: add x11, sp, #16 ; CHECK-NEXT: ld1 { v0.b }[11], [x9] ; CHECK-NEXT: add x9, sp, #1000 -; CHECK-NEXT: ld1 { v1.b }[8], [x8] -; CHECK-NEXT: ld1 { v17.b }[3], [x9] +; CHECK-NEXT: ld1 { v1.b }[8], [x11] +; CHECK-NEXT: ld1 { v16.b }[3], [x9] ; CHECK-NEXT: ld1 { v3.b }[14], [x12] ; CHECK-NEXT: add x12, sp, #488 -; CHECK-NEXT: ld1 { v4.b }[14], [x14] +; CHECK-NEXT: ld1 { v6.b }[14], [x14] ; CHECK-NEXT: add x14, sp, #1392 ; CHECK-NEXT: ld1 { v2.b }[3], [x12] ; CHECK-NEXT: ld1 { v7.b }[4], [x14] -; CHECK-NEXT: add x8, sp, #1008 -; CHECK-NEXT: ld1 { v0.b }[12], [x11] -; CHECK-NEXT: ld1 { v17.b }[4], [x8] -; CHECK-NEXT: add x11, sp, #1400 -; CHECK-NEXT: add x8, sp, #496 -; CHECK-NEXT: ld1 { v2.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #1016 -; CHECK-NEXT: add x9, sp, #24 -; CHECK-NEXT: ld1 { v7.b }[5], [x11] +; CHECK-NEXT: add x11, sp, #1008 +; CHECK-NEXT: ld1 { v0.b }[12], [x8] +; CHECK-NEXT: ld1 { v16.b }[4], [x11] +; CHECK-NEXT: add x8, sp, #1400 ; CHECK-NEXT: ld1 { v3.b }[15], [x10] +; CHECK-NEXT: add x10, sp, #496 +; CHECK-NEXT: add x9, sp, #24 +; CHECK-NEXT: ld1 { v6.b }[15], [x15] +; CHECK-NEXT: ld1 { v7.b }[5], [x8] +; CHECK-NEXT: ld1 { v2.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #1016 +; CHECK-NEXT: ld1 { v16.b }[5], [x10] ; CHECK-NEXT: ld1 { v0.b }[13], [x13] -; CHECK-NEXT: ld1 { v17.b }[5], [x8] -; CHECK-NEXT: add x10, sp, #1408 +; CHECK-NEXT: add x8, sp, #1408 ; CHECK-NEXT: ld1 { v1.b }[9], [x9] -; CHECK-NEXT: add x8, sp, #504 -; CHECK-NEXT: add x9, sp, #32 -; CHECK-NEXT: ld1 { v4.b }[15], [x15] -; CHECK-NEXT: ld1 { v7.b }[6], [x10] -; CHECK-NEXT: ld1 { v2.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #1024 -; CHECK-NEXT: ld1 { v17.b }[6], [x8] -; CHECK-NEXT: ld1 { v0.b }[14], [x16] -; CHECK-NEXT: ld1 { v1.b }[10], [x9] -; CHECK-NEXT: add x9, sp, #1416 +; CHECK-NEXT: add x9, sp, #504 ; CHECK-NEXT: add x10, sp, #512 -; CHECK-NEXT: add x8, sp, #456 -; CHECK-NEXT: ld1 { v7.b }[7], [x9] +; CHECK-NEXT: ld1 { v7.b }[6], [x8] +; CHECK-NEXT: ld1 { v2.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #1024 +; CHECK-NEXT: add x8, sp, #32 +; CHECK-NEXT: ld1 { v16.b }[6], [x9] +; CHECK-NEXT: ld1 { v0.b }[14], [x16] +; CHECK-NEXT: ld1 { v1.b }[10], [x8] +; CHECK-NEXT: add x8, sp, #1416 +; CHECK-NEXT: add x9, sp, #456 +; CHECK-NEXT: ld1 { v7.b }[7], [x8] ; CHECK-NEXT: ld1 { v2.b }[6], [x10] ; CHECK-NEXT: add x10, sp, #1032 -; CHECK-NEXT: add x9, sp, #40 -; CHECK-NEXT: ld1 { v17.b }[7], [x10] -; CHECK-NEXT: ld1 { v0.b }[15], [x8] -; CHECK-NEXT: ld1 { v1.b }[11], [x9] -; CHECK-NEXT: add x9, sp, #1424 -; CHECK-NEXT: add x8, sp, #520 -; CHECK-NEXT: ld1 { v7.b }[8], [x9] -; CHECK-NEXT: ld1 { v2.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #1040 -; CHECK-NEXT: add x9, sp, #48 -; CHECK-NEXT: ld1 { v17.b }[8], [x8] +; CHECK-NEXT: add x8, sp, #40 +; CHECK-NEXT: ld1 { v16.b }[7], [x10] +; CHECK-NEXT: ld1 { v0.b }[15], [x9] +; CHECK-NEXT: ld1 { v1.b }[11], [x8] +; CHECK-NEXT: add x8, sp, #1424 +; CHECK-NEXT: add x9, sp, #520 +; CHECK-NEXT: ld1 { v7.b }[8], [x8] +; CHECK-NEXT: ld1 { v2.b }[7], [x9] +; CHECK-NEXT: add x9, sp, #1040 +; CHECK-NEXT: add x8, sp, #48 +; CHECK-NEXT: ld1 { v16.b }[8], [x9] ; CHECK-NEXT: add x10, sp, #528 -; CHECK-NEXT: ld1 { v1.b }[12], [x9] -; CHECK-NEXT: add x9, sp, #1432 -; CHECK-NEXT: sdot v6.4s, v4.16b, v3.16b -; CHECK-NEXT: ld1 { v7.b }[9], [x9] +; CHECK-NEXT: ld1 { v1.b }[12], [x8] +; CHECK-NEXT: add x8, sp, #1432 +; CHECK-NEXT: sdot v5.4s, v6.16b, v3.16b +; CHECK-NEXT: ld1 { v7.b }[9], [x8] ; CHECK-NEXT: ld1 { v2.b }[8], [x10] -; CHECK-NEXT: add x9, sp, #1048 +; CHECK-NEXT: add x8, sp, #1048 ; CHECK-NEXT: ldr b3, [sp, #80] -; CHECK-NEXT: ld1 { v17.b }[9], [x9] -; CHECK-NEXT: add x8, sp, #56 +; CHECK-NEXT: ld1 { v16.b }[9], [x8] ; CHECK-NEXT: add x10, sp, #88 -; CHECK-NEXT: add x9, sp, #536 +; CHECK-NEXT: add x8, sp, #536 ; CHECK-NEXT: add x11, sp, #1440 +; CHECK-NEXT: add x9, sp, #56 ; CHECK-NEXT: ld1 { v3.b }[1], [x10] -; CHECK-NEXT: ld1 { v1.b }[13], [x8] -; CHECK-NEXT: ld1 { v2.b }[9], [x9] +; CHECK-NEXT: ld1 { v2.b }[9], [x8] ; CHECK-NEXT: add x8, sp, #1056 ; CHECK-NEXT: ld1 { v7.b }[10], [x11] +; CHECK-NEXT: ld1 { v16.b }[10], [x8] +; CHECK-NEXT: ld1 { v1.b }[13], [x9] ; CHECK-NEXT: add x9, sp, #96 -; CHECK-NEXT: ld1 { v17.b }[10], [x8] ; CHECK-NEXT: add x8, sp, #544 ; CHECK-NEXT: add x10, sp, #1448 ; CHECK-NEXT: ld1 { v3.b }[2], [x9] ; CHECK-NEXT: ld1 { v2.b }[10], [x8] ; CHECK-NEXT: add x8, sp, #1064 ; CHECK-NEXT: ld1 { v7.b }[11], [x10] +; CHECK-NEXT: ld1 { v16.b }[11], [x8] ; CHECK-NEXT: add x10, sp, #104 -; CHECK-NEXT: add x11, sp, #1456 -; CHECK-NEXT: ld1 { v17.b }[11], [x8] ; CHECK-NEXT: add x8, sp, #552 +; CHECK-NEXT: add x11, sp, #1456 ; CHECK-NEXT: add x9, sp, #64 ; CHECK-NEXT: ld1 { v3.b }[3], [x10] ; CHECK-NEXT: ld1 { v2.b }[11], [x8] ; CHECK-NEXT: add x8, sp, #1072 ; CHECK-NEXT: ld1 { v7.b }[12], [x11] +; CHECK-NEXT: ld1 { v16.b }[12], [x8] ; CHECK-NEXT: ld1 { v1.b }[14], [x9] ; CHECK-NEXT: add x9, sp, #112 -; CHECK-NEXT: ld1 { v17.b }[12], [x8] ; CHECK-NEXT: add x8, sp, #560 ; CHECK-NEXT: add x10, sp, #1464 ; CHECK-NEXT: ld1 { v3.b }[4], [x9] ; CHECK-NEXT: ld1 { v2.b }[12], [x8] ; CHECK-NEXT: add x8, sp, #1080 ; CHECK-NEXT: ld1 { v7.b }[13], [x10] +; CHECK-NEXT: ld1 { v16.b }[13], [x8] ; CHECK-NEXT: add x10, sp, #120 -; CHECK-NEXT: add x11, sp, #1472 -; CHECK-NEXT: ld1 { v17.b }[13], [x8] ; CHECK-NEXT: add x8, sp, #568 +; CHECK-NEXT: add x11, sp, #1472 ; CHECK-NEXT: add x9, sp, #72 ; CHECK-NEXT: ld1 { v3.b }[5], [x10] ; CHECK-NEXT: ld1 { v2.b }[13], [x8] ; CHECK-NEXT: add x8, sp, #1088 ; CHECK-NEXT: ld1 { v7.b }[14], [x11] +; CHECK-NEXT: ld1 { v16.b }[14], [x8] ; CHECK-NEXT: ld1 { v1.b }[15], [x9] ; CHECK-NEXT: add x9, sp, #128 -; CHECK-NEXT: ld1 { v17.b }[14], [x8] -; CHECK-NEXT: ldr b4, [sp, #1104] +; CHECK-NEXT: ldr b6, [sp, #1104] ; CHECK-NEXT: add x10, sp, #1480 ; CHECK-NEXT: ld1 { v3.b }[6], [x9] ; CHECK-NEXT: add x8, sp, #1096 ; CHECK-NEXT: add x9, sp, #1112 ; CHECK-NEXT: ld1 { v7.b }[15], [x10] -; CHECK-NEXT: ld1 { v4.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #576 -; CHECK-NEXT: ld1 { v17.b }[15], [x8] +; CHECK-NEXT: ld1 { v16.b }[15], [x8] +; CHECK-NEXT: ld1 { v6.b }[1], [x9] ; CHECK-NEXT: add x8, sp, #728 +; CHECK-NEXT: add x9, sp, #576 ; CHECK-NEXT: add x10, sp, #136 -; CHECK-NEXT: ld1 { v16.b }[1], [x8] +; CHECK-NEXT: ld1 { v17.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #1120 ; CHECK-NEXT: ld1 { v2.b }[14], [x9] -; CHECK-NEXT: ld1 { v4.b }[2], [x8] +; CHECK-NEXT: sdot v4.4s, v16.16b, v7.16b +; CHECK-NEXT: ld1 { v6.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #736 -; CHECK-NEXT: ld1 { v3.b }[7], [x10] -; CHECK-NEXT: sdot v5.4s, v17.16b, v7.16b ; CHECK-NEXT: ldr b7, [sp, #1232] -; CHECK-NEXT: ldr b17, [sp, #848] -; CHECK-NEXT: ld1 { v16.b }[2], [x8] +; CHECK-NEXT: ldr b16, [sp, #848] +; CHECK-NEXT: ld1 { v3.b }[7], [x10] +; CHECK-NEXT: ld1 { v17.b }[2], [x8] ; CHECK-NEXT: add x9, sp, #1240 ; CHECK-NEXT: add x10, sp, #856 ; CHECK-NEXT: ld1 { v7.b }[1], [x9] -; CHECK-NEXT: ld1 { v17.b }[1], [x10] +; CHECK-NEXT: ld1 { v16.b }[1], [x10] ; CHECK-NEXT: add x8, sp, #1128 ; CHECK-NEXT: add x11, sp, #744 -; CHECK-NEXT: ld1 { v4.b }[3], [x8] +; CHECK-NEXT: ld1 { v6.b }[3], [x8] ; CHECK-NEXT: add x10, sp, #1248 -; CHECK-NEXT: ld1 { v16.b }[3], [x11] +; CHECK-NEXT: ld1 { v17.b }[3], [x11] ; CHECK-NEXT: add x11, sp, #864 ; CHECK-NEXT: add x9, sp, #144 ; CHECK-NEXT: ld1 { v7.b }[2], [x10] -; CHECK-NEXT: ld1 { v17.b }[2], [x11] +; CHECK-NEXT: ld1 { v16.b }[2], [x11] ; CHECK-NEXT: add x8, sp, #1136 ; CHECK-NEXT: add x12, sp, #752 ; CHECK-NEXT: ld1 { v3.b }[8], [x9] -; CHECK-NEXT: ld1 { v4.b }[4], [x8] -; CHECK-NEXT: ld1 { v16.b }[4], [x12] +; CHECK-NEXT: ld1 { v6.b }[4], [x8] +; CHECK-NEXT: ld1 { v17.b }[4], [x12] ; CHECK-NEXT: add x9, sp, #1256 ; CHECK-NEXT: add x10, sp, #872 ; CHECK-NEXT: ld1 { v7.b }[3], [x9] -; CHECK-NEXT: ld1 { v17.b }[3], [x10] +; CHECK-NEXT: ld1 { v16.b }[3], [x10] ; CHECK-NEXT: add x8, sp, #1144 ; CHECK-NEXT: add x11, sp, #760 -; CHECK-NEXT: ld1 { v4.b }[5], [x8] +; CHECK-NEXT: ld1 { v6.b }[5], [x8] ; CHECK-NEXT: add x10, sp, #1264 -; CHECK-NEXT: ld1 { v16.b }[5], [x11] +; CHECK-NEXT: ld1 { v17.b }[5], [x11] ; CHECK-NEXT: add x11, sp, #880 ; CHECK-NEXT: add x9, sp, #152 ; CHECK-NEXT: ld1 { v7.b }[4], [x10] -; CHECK-NEXT: ld1 { v17.b }[4], [x11] +; CHECK-NEXT: ld1 { v16.b }[4], [x11] ; CHECK-NEXT: add x8, sp, #1152 ; CHECK-NEXT: add x12, sp, #768 ; CHECK-NEXT: ld1 { v3.b }[9], [x9] -; CHECK-NEXT: ld1 { v4.b }[6], [x8] -; CHECK-NEXT: ld1 { v16.b }[6], [x12] +; CHECK-NEXT: ld1 { v6.b }[6], [x8] +; CHECK-NEXT: ld1 { v17.b }[6], [x12] ; CHECK-NEXT: add x9, sp, #1272 ; CHECK-NEXT: add x10, sp, #888 ; CHECK-NEXT: ld1 { v7.b }[5], [x9] -; CHECK-NEXT: ld1 { v17.b }[5], [x10] +; CHECK-NEXT: ld1 { v16.b }[5], [x10] ; CHECK-NEXT: add x8, sp, #1160 ; CHECK-NEXT: add x11, sp, #776 -; CHECK-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-NEXT: ld1 { v6.b }[7], [x8] ; CHECK-NEXT: add x10, sp, #1280 -; CHECK-NEXT: ld1 { v16.b }[7], [x11] +; CHECK-NEXT: ld1 { v17.b }[7], [x11] ; CHECK-NEXT: add x11, sp, #896 ; CHECK-NEXT: add x9, sp, #160 ; CHECK-NEXT: ld1 { v7.b }[6], [x10] -; CHECK-NEXT: ld1 { v17.b }[6], [x11] +; CHECK-NEXT: ld1 { v16.b }[6], [x11] ; CHECK-NEXT: add x8, sp, #1168 ; CHECK-NEXT: add x12, sp, #784 ; CHECK-NEXT: ld1 { v3.b }[10], [x9] -; CHECK-NEXT: ld1 { v4.b }[8], [x8] -; CHECK-NEXT: ld1 { v16.b }[8], [x12] +; CHECK-NEXT: ld1 { v6.b }[8], [x8] +; CHECK-NEXT: ld1 { v17.b }[8], [x12] ; CHECK-NEXT: add x9, sp, #1288 ; CHECK-NEXT: add x10, sp, #904 ; CHECK-NEXT: ld1 { v7.b }[7], [x9] -; CHECK-NEXT: ld1 { v17.b }[7], [x10] +; CHECK-NEXT: ld1 { v16.b }[7], [x10] ; CHECK-NEXT: add x8, sp, #1176 ; CHECK-NEXT: add x11, sp, #792 -; CHECK-NEXT: ld1 { v4.b }[9], [x8] +; CHECK-NEXT: ld1 { v6.b }[9], [x8] ; CHECK-NEXT: add x10, sp, #1296 -; CHECK-NEXT: ld1 { v16.b }[9], [x11] +; CHECK-NEXT: ld1 { v17.b }[9], [x11] ; CHECK-NEXT: add x11, sp, #912 ; CHECK-NEXT: add x9, sp, #168 ; CHECK-NEXT: ld1 { v7.b }[8], [x10] -; CHECK-NEXT: ld1 { v17.b }[8], [x11] +; CHECK-NEXT: ld1 { v16.b }[8], [x11] ; CHECK-NEXT: add x8, sp, #1184 ; CHECK-NEXT: add x12, sp, #800 ; CHECK-NEXT: ld1 { v3.b }[11], [x9] -; CHECK-NEXT: ld1 { v4.b }[10], [x8] -; CHECK-NEXT: ld1 { v16.b }[10], [x12] +; CHECK-NEXT: ld1 { v6.b }[10], [x8] +; CHECK-NEXT: ld1 { v17.b }[10], [x12] ; CHECK-NEXT: add x9, sp, #1304 ; CHECK-NEXT: add x10, sp, #920 ; CHECK-NEXT: ld1 { v7.b }[9], [x9] -; CHECK-NEXT: ld1 { v17.b }[9], [x10] +; CHECK-NEXT: ld1 { v16.b }[9], [x10] ; CHECK-NEXT: add x8, sp, #1192 ; CHECK-NEXT: add x11, sp, #808 -; CHECK-NEXT: ld1 { v4.b }[11], [x8] +; CHECK-NEXT: ld1 { v6.b }[11], [x8] ; CHECK-NEXT: add x10, sp, #1312 -; CHECK-NEXT: ld1 { v16.b }[11], [x11] +; CHECK-NEXT: ld1 { v17.b }[11], [x11] ; CHECK-NEXT: add x11, sp, #928 ; CHECK-NEXT: add x9, sp, #176 ; CHECK-NEXT: ld1 { v7.b }[10], [x10] -; CHECK-NEXT: ld1 { v17.b }[10], [x11] +; CHECK-NEXT: ld1 { v16.b }[10], [x11] ; CHECK-NEXT: add x8, sp, #1200 ; CHECK-NEXT: add x12, sp, #816 ; CHECK-NEXT: ld1 { v3.b }[12], [x9] -; CHECK-NEXT: ld1 { v4.b }[12], [x8] -; CHECK-NEXT: ld1 { v16.b }[12], [x12] +; CHECK-NEXT: ld1 { v6.b }[12], [x8] +; CHECK-NEXT: ld1 { v17.b }[12], [x12] ; CHECK-NEXT: add x9, sp, #1320 ; CHECK-NEXT: add x10, sp, #936 ; CHECK-NEXT: ld1 { v7.b }[11], [x9] -; CHECK-NEXT: ld1 { v17.b }[11], [x10] +; CHECK-NEXT: ld1 { v16.b }[11], [x10] ; CHECK-NEXT: add x8, sp, #1208 ; CHECK-NEXT: add x11, sp, #824 -; CHECK-NEXT: ld1 { v4.b }[13], [x8] +; CHECK-NEXT: ld1 { v6.b }[13], [x8] ; CHECK-NEXT: add x10, sp, #1328 -; CHECK-NEXT: ld1 { v16.b }[13], [x11] +; CHECK-NEXT: ld1 { v17.b }[13], [x11] ; CHECK-NEXT: add x11, sp, #944 ; CHECK-NEXT: add x9, sp, #184 ; CHECK-NEXT: ld1 { v7.b }[12], [x10] -; CHECK-NEXT: ld1 { v17.b }[12], [x11] +; CHECK-NEXT: ld1 { v16.b }[12], [x11] ; CHECK-NEXT: add x8, sp, #1216 ; CHECK-NEXT: add x12, sp, #832 ; CHECK-NEXT: ld1 { v3.b }[13], [x9] -; CHECK-NEXT: ld1 { v4.b }[14], [x8] -; CHECK-NEXT: ld1 { v16.b }[14], [x12] +; CHECK-NEXT: ld1 { v6.b }[14], [x8] +; CHECK-NEXT: ld1 { v17.b }[14], [x12] ; CHECK-NEXT: add x9, sp, #1336 ; CHECK-NEXT: add x10, sp, #952 ; CHECK-NEXT: ld1 { v7.b }[13], [x9] -; CHECK-NEXT: ld1 { v17.b }[13], [x10] +; CHECK-NEXT: ld1 { v16.b }[13], [x10] ; CHECK-NEXT: add x8, sp, #1224 ; CHECK-NEXT: add x11, sp, #840 -; CHECK-NEXT: ld1 { v4.b }[15], [x8] +; CHECK-NEXT: ld1 { v6.b }[15], [x8] ; CHECK-NEXT: add x8, sp, #192 -; CHECK-NEXT: ld1 { v16.b }[15], [x11] +; CHECK-NEXT: ld1 { v17.b }[15], [x11] ; CHECK-NEXT: add x10, sp, #1344 ; CHECK-NEXT: add x11, sp, #960 ; CHECK-NEXT: ld1 { v3.b }[14], [x8] ; CHECK-NEXT: ld1 { v7.b }[14], [x10] -; CHECK-NEXT: ld1 { v17.b }[14], [x11] +; CHECK-NEXT: ld1 { v16.b }[14], [x11] ; CHECK-NEXT: add x9, sp, #584 -; CHECK-NEXT: sdot v6.4s, v1.16b, v0.16b +; CHECK-NEXT: sdot v5.4s, v1.16b, v0.16b ; CHECK-NEXT: add x8, sp, #200 -; CHECK-NEXT: sdot v5.4s, v16.16b, v4.16b +; CHECK-NEXT: sdot v4.4s, v17.16b, v6.16b ; CHECK-NEXT: ld1 { v2.b }[15], [x9] ; CHECK-NEXT: add x9, sp, #1352 ; CHECK-NEXT: add x10, sp, #968 ; CHECK-NEXT: ld1 { v3.b }[15], [x8] ; CHECK-NEXT: ld1 { v7.b }[15], [x9] -; CHECK-NEXT: ld1 { v17.b }[15], [x10] -; CHECK-NEXT: sdot v6.4s, v3.16b, v2.16b -; CHECK-NEXT: sdot v5.4s, v17.16b, v7.16b -; CHECK-NEXT: add v0.4s, v6.4s, v5.4s +; CHECK-NEXT: ld1 { v16.b }[15], [x10] +; CHECK-NEXT: sdot v5.4s, v3.16b, v2.16b +; CHECK-NEXT: sdot v4.4s, v16.16b, v7.16b +; CHECK-NEXT: add v0.4s, v5.4s, v4.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -2662,195 +2662,195 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> % ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ldr b1, [sp, #208] +; CHECK-NEXT: ldr b5, [sp, #208] ; CHECK-NEXT: add x8, sp, #216 ; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: ldr b5, [sp, #976] +; CHECK-NEXT: ldr b4, [sp, #976] ; CHECK-NEXT: add x9, sp, #984 ; CHECK-NEXT: add x12, sp, #328 -; CHECK-NEXT: ld1 { v1.b }[1], [x8] +; CHECK-NEXT: ld1 { v5.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #224 -; CHECK-NEXT: movi v2.16b, #1 +; CHECK-NEXT: movi v1.16b, #1 ; CHECK-NEXT: mov v0.b[1], w1 -; CHECK-NEXT: ld1 { v5.b }[1], [x9] -; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v4.b }[1], [x9] +; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: add x11, sp, #992 ; CHECK-NEXT: ldr b6, [sp, #720] ; CHECK-NEXT: ldr b7, [sp, #80] -; CHECK-NEXT: ld1 { v1.b }[2], [x8] +; CHECK-NEXT: ld1 { v5.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #232 ; CHECK-NEXT: add x13, sp, #88 -; CHECK-NEXT: ld1 { v5.b }[2], [x11] +; CHECK-NEXT: ld1 { v4.b }[2], [x11] ; CHECK-NEXT: ld1 { v7.b }[1], [x13] ; CHECK-NEXT: add x13, sp, #856 ; CHECK-NEXT: mov v0.b[2], w2 -; CHECK-NEXT: add x14, sp, #744 +; CHECK-NEXT: add x14, sp, #1008 ; CHECK-NEXT: add x15, sp, #872 -; CHECK-NEXT: ld1 { v1.b }[3], [x8] +; CHECK-NEXT: ld1 { v5.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #240 ; CHECK-NEXT: add x16, sp, #888 ; CHECK-NEXT: add x10, sp, #16 ; CHECK-NEXT: add x9, sp, #24 ; CHECK-NEXT: add x11, sp, #40 -; CHECK-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v1.b }[4], [x8] +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v5.b }[4], [x8] ; CHECK-NEXT: add x8, sp, #248 ; CHECK-NEXT: mov v0.b[3], w3 -; CHECK-NEXT: ld1 { v1.b }[5], [x8] +; CHECK-NEXT: ld1 { v5.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #256 ; CHECK-NEXT: mov v0.b[4], w4 -; CHECK-NEXT: ld1 { v1.b }[6], [x8] +; CHECK-NEXT: ld1 { v5.b }[6], [x8] ; CHECK-NEXT: add x8, sp, #264 ; CHECK-NEXT: mov v0.b[5], w5 -; CHECK-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-NEXT: ld1 { v5.b }[7], [x8] ; CHECK-NEXT: add x8, sp, #272 -; CHECK-NEXT: ld1 { v1.b }[8], [x8] +; CHECK-NEXT: ld1 { v5.b }[8], [x8] ; CHECK-NEXT: add x8, sp, #280 ; CHECK-NEXT: mov v0.b[6], w6 -; CHECK-NEXT: ld1 { v1.b }[9], [x8] +; CHECK-NEXT: ld1 { v5.b }[9], [x8] ; CHECK-NEXT: add x8, sp, #288 ; CHECK-NEXT: mov v0.b[7], w7 -; CHECK-NEXT: ld1 { v1.b }[10], [x8] +; CHECK-NEXT: ld1 { v5.b }[10], [x8] ; CHECK-NEXT: add x8, sp, #296 ; CHECK-NEXT: ld1 { v0.b }[8], [x10] ; CHECK-NEXT: add x10, sp, #128 -; CHECK-NEXT: ld1 { v1.b }[11], [x8] +; CHECK-NEXT: ld1 { v5.b }[11], [x8] ; CHECK-NEXT: add x8, sp, #304 ; CHECK-NEXT: ld1 { v0.b }[9], [x9] ; CHECK-NEXT: add x9, sp, #136 -; CHECK-NEXT: ld1 { v1.b }[12], [x8] +; CHECK-NEXT: ld1 { v5.b }[12], [x8] ; CHECK-NEXT: add x8, sp, #312 -; CHECK-NEXT: ld1 { v1.b }[13], [x8] +; CHECK-NEXT: ld1 { v5.b }[13], [x8] ; CHECK-NEXT: add x8, sp, #320 -; CHECK-NEXT: ld1 { v1.b }[14], [x8] +; CHECK-NEXT: ld1 { v5.b }[14], [x8] ; CHECK-NEXT: add x8, sp, #32 ; CHECK-NEXT: ld1 { v0.b }[10], [x8] ; CHECK-NEXT: add x8, sp, #144 -; CHECK-NEXT: ld1 { v1.b }[15], [x12] +; CHECK-NEXT: ld1 { v5.b }[15], [x12] ; CHECK-NEXT: add x12, sp, #728 ; CHECK-NEXT: ld1 { v6.b }[1], [x12] ; CHECK-NEXT: add x12, sp, #1000 ; CHECK-NEXT: ld1 { v0.b }[11], [x11] -; CHECK-NEXT: ld1 { v5.b }[3], [x12] +; CHECK-NEXT: ld1 { v4.b }[3], [x12] ; CHECK-NEXT: add x12, sp, #736 ; CHECK-NEXT: add x11, sp, #920 -; CHECK-NEXT: sdot v4.4s, v1.16b, v2.16b -; CHECK-NEXT: ldr b1, [sp, #848] +; CHECK-NEXT: sdot v3.4s, v5.16b, v1.16b +; CHECK-NEXT: ldr b5, [sp, #848] ; CHECK-NEXT: ld1 { v6.b }[2], [x12] -; CHECK-NEXT: add x12, sp, #1008 -; CHECK-NEXT: ld1 { v1.b }[1], [x13] -; CHECK-NEXT: ld1 { v5.b }[4], [x12] -; CHECK-NEXT: add x12, sp, #96 -; CHECK-NEXT: ld1 { v7.b }[2], [x12] -; CHECK-NEXT: add x12, sp, #1016 -; CHECK-NEXT: add x13, sp, #48 -; CHECK-NEXT: ld1 { v6.b }[3], [x14] -; CHECK-NEXT: add x14, sp, #864 -; CHECK-NEXT: ld1 { v0.b }[12], [x13] -; CHECK-NEXT: ld1 { v1.b }[2], [x14] -; CHECK-NEXT: add x14, sp, #752 -; CHECK-NEXT: ld1 { v5.b }[5], [x12] -; CHECK-NEXT: add x12, sp, #104 -; CHECK-NEXT: ld1 { v6.b }[4], [x14] -; CHECK-NEXT: add x14, sp, #1024 -; CHECK-NEXT: ld1 { v7.b }[3], [x12] -; CHECK-NEXT: ld1 { v1.b }[3], [x15] +; CHECK-NEXT: add x12, sp, #48 +; CHECK-NEXT: ld1 { v5.b }[1], [x13] +; CHECK-NEXT: add x13, sp, #744 +; CHECK-NEXT: ld1 { v4.b }[4], [x14] +; CHECK-NEXT: add x14, sp, #96 +; CHECK-NEXT: ld1 { v0.b }[12], [x12] +; CHECK-NEXT: ld1 { v6.b }[3], [x13] +; CHECK-NEXT: add x13, sp, #864 +; CHECK-NEXT: ld1 { v7.b }[2], [x14] +; CHECK-NEXT: add x14, sp, #1016 +; CHECK-NEXT: ld1 { v5.b }[2], [x13] +; CHECK-NEXT: add x13, sp, #752 +; CHECK-NEXT: ld1 { v4.b }[5], [x14] +; CHECK-NEXT: add x14, sp, #104 +; CHECK-NEXT: ld1 { v6.b }[4], [x13] +; CHECK-NEXT: add x13, sp, #1024 +; CHECK-NEXT: ld1 { v7.b }[3], [x14] +; CHECK-NEXT: ld1 { v5.b }[3], [x15] ; CHECK-NEXT: add x15, sp, #760 -; CHECK-NEXT: ld1 { v5.b }[6], [x14] -; CHECK-NEXT: add x12, sp, #112 -; CHECK-NEXT: add x14, sp, #880 +; CHECK-NEXT: add x14, sp, #112 +; CHECK-NEXT: ld1 { v4.b }[6], [x13] +; CHECK-NEXT: add x13, sp, #880 ; CHECK-NEXT: ld1 { v6.b }[5], [x15] ; CHECK-NEXT: add x15, sp, #1032 -; CHECK-NEXT: ld1 { v7.b }[4], [x12] -; CHECK-NEXT: ld1 { v1.b }[4], [x14] +; CHECK-NEXT: ld1 { v7.b }[4], [x14] +; CHECK-NEXT: ld1 { v5.b }[4], [x13] ; CHECK-NEXT: add x14, sp, #768 -; CHECK-NEXT: ld1 { v5.b }[7], [x15] -; CHECK-NEXT: add x12, sp, #120 +; CHECK-NEXT: add x13, sp, #120 +; CHECK-NEXT: ld1 { v4.b }[7], [x15] ; CHECK-NEXT: add x15, sp, #1040 ; CHECK-NEXT: ld1 { v6.b }[6], [x14] -; CHECK-NEXT: ld1 { v7.b }[5], [x12] -; CHECK-NEXT: add x12, sp, #776 -; CHECK-NEXT: ld1 { v1.b }[5], [x16] -; CHECK-NEXT: ld1 { v5.b }[8], [x15] -; CHECK-NEXT: add x15, sp, #896 +; CHECK-NEXT: ld1 { v7.b }[5], [x13] +; CHECK-NEXT: add x13, sp, #776 +; CHECK-NEXT: ld1 { v5.b }[5], [x16] ; CHECK-NEXT: add x14, sp, #1048 -; CHECK-NEXT: ld1 { v6.b }[7], [x12] +; CHECK-NEXT: ld1 { v4.b }[8], [x15] +; CHECK-NEXT: add x15, sp, #896 +; CHECK-NEXT: ld1 { v6.b }[7], [x13] ; CHECK-NEXT: ld1 { v7.b }[6], [x10] ; CHECK-NEXT: add x10, sp, #784 -; CHECK-NEXT: ld1 { v1.b }[6], [x15] -; CHECK-NEXT: ld1 { v5.b }[9], [x14] +; CHECK-NEXT: ld1 { v5.b }[6], [x15] +; CHECK-NEXT: add x13, sp, #1056 +; CHECK-NEXT: ld1 { v4.b }[9], [x14] ; CHECK-NEXT: add x14, sp, #904 -; CHECK-NEXT: add x12, sp, #1056 ; CHECK-NEXT: ld1 { v6.b }[8], [x10] ; CHECK-NEXT: ld1 { v7.b }[7], [x9] ; CHECK-NEXT: add x9, sp, #792 -; CHECK-NEXT: ld1 { v1.b }[7], [x14] -; CHECK-NEXT: ld1 { v5.b }[10], [x12] -; CHECK-NEXT: add x12, sp, #912 +; CHECK-NEXT: ld1 { v5.b }[7], [x14] ; CHECK-NEXT: add x10, sp, #1064 +; CHECK-NEXT: ld1 { v4.b }[10], [x13] +; CHECK-NEXT: add x13, sp, #912 ; CHECK-NEXT: ld1 { v6.b }[9], [x9] ; CHECK-NEXT: ld1 { v7.b }[8], [x8] ; CHECK-NEXT: add x9, sp, #800 -; CHECK-NEXT: ld1 { v1.b }[8], [x12] -; CHECK-NEXT: ld1 { v5.b }[11], [x10] +; CHECK-NEXT: ld1 { v5.b }[8], [x13] ; CHECK-NEXT: add x8, sp, #152 +; CHECK-NEXT: ld1 { v4.b }[11], [x10] ; CHECK-NEXT: add x10, sp, #1072 ; CHECK-NEXT: ld1 { v6.b }[10], [x9] ; CHECK-NEXT: ld1 { v7.b }[9], [x8] ; CHECK-NEXT: add x9, sp, #808 -; CHECK-NEXT: ld1 { v1.b }[9], [x11] -; CHECK-NEXT: ld1 { v5.b }[12], [x10] -; CHECK-NEXT: add x10, sp, #160 +; CHECK-NEXT: ld1 { v5.b }[9], [x11] ; CHECK-NEXT: add x8, sp, #56 +; CHECK-NEXT: ld1 { v4.b }[12], [x10] +; CHECK-NEXT: add x10, sp, #160 +; CHECK-NEXT: ld1 { v0.b }[13], [x8] ; CHECK-NEXT: ld1 { v6.b }[11], [x9] ; CHECK-NEXT: add x9, sp, #928 ; CHECK-NEXT: ld1 { v7.b }[10], [x10] ; CHECK-NEXT: add x10, sp, #1080 -; CHECK-NEXT: ld1 { v1.b }[10], [x9] -; CHECK-NEXT: ld1 { v0.b }[13], [x8] -; CHECK-NEXT: ld1 { v5.b }[13], [x10] +; CHECK-NEXT: ld1 { v5.b }[10], [x9] ; CHECK-NEXT: add x8, sp, #816 +; CHECK-NEXT: ld1 { v4.b }[13], [x10] ; CHECK-NEXT: add x9, sp, #168 +; CHECK-NEXT: add x10, sp, #176 ; CHECK-NEXT: ld1 { v6.b }[12], [x8] ; CHECK-NEXT: add x8, sp, #936 ; CHECK-NEXT: ld1 { v7.b }[11], [x9] ; CHECK-NEXT: add x9, sp, #1088 -; CHECK-NEXT: ld1 { v1.b }[11], [x8] -; CHECK-NEXT: add x10, sp, #176 -; CHECK-NEXT: ld1 { v5.b }[14], [x9] -; CHECK-NEXT: add x9, sp, #824 +; CHECK-NEXT: ld1 { v5.b }[11], [x8] ; CHECK-NEXT: add x8, sp, #64 +; CHECK-NEXT: ld1 { v4.b }[14], [x9] +; CHECK-NEXT: add x9, sp, #824 +; CHECK-NEXT: ld1 { v0.b }[14], [x8] ; CHECK-NEXT: ld1 { v6.b }[13], [x9] ; CHECK-NEXT: add x9, sp, #944 ; CHECK-NEXT: ld1 { v7.b }[12], [x10] ; CHECK-NEXT: add x10, sp, #1096 -; CHECK-NEXT: ld1 { v1.b }[12], [x9] -; CHECK-NEXT: ld1 { v0.b }[14], [x8] -; CHECK-NEXT: ld1 { v5.b }[15], [x10] +; CHECK-NEXT: ld1 { v5.b }[12], [x9] ; CHECK-NEXT: add x8, sp, #832 +; CHECK-NEXT: ld1 { v4.b }[15], [x10] ; CHECK-NEXT: add x9, sp, #184 +; CHECK-NEXT: add x10, sp, #72 ; CHECK-NEXT: ld1 { v6.b }[14], [x8] ; CHECK-NEXT: add x8, sp, #952 ; CHECK-NEXT: ld1 { v7.b }[13], [x9] -; CHECK-NEXT: ld1 { v1.b }[13], [x8] -; CHECK-NEXT: add x10, sp, #72 +; CHECK-NEXT: ld1 { v5.b }[13], [x8] ; CHECK-NEXT: add x8, sp, #840 -; CHECK-NEXT: sdot v3.4s, v5.16b, v2.16b ; CHECK-NEXT: ld1 { v0.b }[15], [x10] +; CHECK-NEXT: sdot v2.4s, v4.16b, v1.16b ; CHECK-NEXT: add x9, sp, #192 ; CHECK-NEXT: ld1 { v6.b }[15], [x8] ; CHECK-NEXT: add x8, sp, #960 ; CHECK-NEXT: ld1 { v7.b }[14], [x9] -; CHECK-NEXT: ld1 { v1.b }[14], [x8] +; CHECK-NEXT: ld1 { v5.b }[14], [x8] +; CHECK-NEXT: sdot v3.4s, v0.16b, v1.16b ; CHECK-NEXT: add x8, sp, #200 ; CHECK-NEXT: add x9, sp, #968 -; CHECK-NEXT: sdot v4.4s, v0.16b, v2.16b -; CHECK-NEXT: sdot v3.4s, v6.16b, v2.16b +; CHECK-NEXT: sdot v2.4s, v6.16b, v1.16b ; CHECK-NEXT: ld1 { v7.b }[15], [x8] -; CHECK-NEXT: ld1 { v1.b }[15], [x9] -; CHECK-NEXT: sdot v4.4s, v7.16b, v2.16b -; CHECK-NEXT: sdot v3.4s, v1.16b, v2.16b -; CHECK-NEXT: add v0.4s, v4.4s, v3.4s +; CHECK-NEXT: ld1 { v5.b }[15], [x9] +; CHECK-NEXT: sdot v3.4s, v7.16b, v1.16b +; CHECK-NEXT: sdot v2.4s, v5.16b, v1.16b +; CHECK-NEXT: add v0.4s, v3.4s, v2.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll index 6d2305059ce88..913205f327536 100644 --- a/llvm/test/CodeGen/AArch64/neon-extadd.ll +++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll @@ -459,48 +459,48 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) { ; CHECK-NEXT: mov v0.b[1], w1 ; CHECK-NEXT: ld1 { v3.b }[1], [x11] ; CHECK-NEXT: ld1 { v1.b }[1], [x9] -; CHECK-NEXT: add x12, sp, #16 +; CHECK-NEXT: add x11, sp, #16 ; CHECK-NEXT: add x9, sp, #112 ; CHECK-NEXT: add x13, sp, #184 ; CHECK-NEXT: ld1 { v2.b }[2], [x10] -; CHECK-NEXT: add x11, sp, #120 +; CHECK-NEXT: add x12, sp, #120 ; CHECK-NEXT: add x14, sp, #32 -; CHECK-NEXT: ld1 { v3.b }[2], [x12] +; CHECK-NEXT: ld1 { v3.b }[2], [x11] ; CHECK-NEXT: ld1 { v1.b }[2], [x9] ; CHECK-NEXT: ldr b5, [sp, #64] ; CHECK-NEXT: mov v0.b[2], w2 ; CHECK-NEXT: ldr b4, [sp, #224] -; CHECK-NEXT: add x10, sp, #128 +; CHECK-NEXT: add x11, sp, #128 ; CHECK-NEXT: ld1 { v2.b }[3], [x13] ; CHECK-NEXT: add x13, sp, #24 -; CHECK-NEXT: add x12, sp, #136 +; CHECK-NEXT: add x10, sp, #136 ; CHECK-NEXT: ld1 { v3.b }[3], [x13] -; CHECK-NEXT: ld1 { v1.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #192 +; CHECK-NEXT: ld1 { v1.b }[3], [x12] +; CHECK-NEXT: add x12, sp, #192 ; CHECK-NEXT: add x13, sp, #200 ; CHECK-NEXT: add x15, sp, #80 ; CHECK-NEXT: add x9, sp, #144 ; CHECK-NEXT: mov v0.b[3], w3 -; CHECK-NEXT: ld1 { v2.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #232 +; CHECK-NEXT: ld1 { v2.b }[4], [x12] +; CHECK-NEXT: add x12, sp, #232 ; CHECK-NEXT: ld1 { v3.b }[4], [x14] ; CHECK-NEXT: add x14, sp, #72 -; CHECK-NEXT: ld1 { v4.b }[1], [x11] +; CHECK-NEXT: ld1 { v4.b }[1], [x12] ; CHECK-NEXT: ld1 { v5.b }[1], [x14] ; CHECK-NEXT: add x14, sp, #40 -; CHECK-NEXT: ld1 { v1.b }[4], [x10] +; CHECK-NEXT: ld1 { v1.b }[4], [x11] ; CHECK-NEXT: ld1 { v2.b }[5], [x13] -; CHECK-NEXT: add x11, sp, #208 +; CHECK-NEXT: add x12, sp, #208 ; CHECK-NEXT: add x13, sp, #48 ; CHECK-NEXT: mov v0.b[4], w4 ; CHECK-NEXT: ld1 { v3.b }[5], [x14] ; CHECK-NEXT: add x14, sp, #240 ; CHECK-NEXT: ld1 { v4.b }[2], [x14] ; CHECK-NEXT: ld1 { v5.b }[2], [x15] -; CHECK-NEXT: ld1 { v1.b }[5], [x12] -; CHECK-NEXT: ld1 { v2.b }[6], [x11] -; CHECK-NEXT: add x10, sp, #216 -; CHECK-NEXT: add x11, sp, #56 +; CHECK-NEXT: ld1 { v1.b }[5], [x10] +; CHECK-NEXT: ld1 { v2.b }[6], [x12] +; CHECK-NEXT: add x11, sp, #216 +; CHECK-NEXT: add x10, sp, #56 ; CHECK-NEXT: ld1 { v3.b }[6], [x13] ; CHECK-NEXT: add x12, sp, #248 ; CHECK-NEXT: add x13, sp, #88 @@ -508,9 +508,9 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) { ; CHECK-NEXT: ld1 { v4.b }[3], [x12] ; CHECK-NEXT: ld1 { v5.b }[3], [x13] ; CHECK-NEXT: ld1 { v1.b }[6], [x9] -; CHECK-NEXT: ld1 { v2.b }[7], [x10] +; CHECK-NEXT: ld1 { v2.b }[7], [x11] ; CHECK-NEXT: add x9, sp, #152 -; CHECK-NEXT: ld1 { v3.b }[7], [x11] +; CHECK-NEXT: ld1 { v3.b }[7], [x10] ; CHECK-NEXT: uaddl v4.8h, v5.8b, v4.8b ; CHECK-NEXT: mov v0.b[6], w6 ; CHECK-NEXT: ld1 { v1.b }[7], [x9] @@ -545,23 +545,23 @@ define <16 x i32> @i12(<16 x i12> %s0, <16 x i12> %s1) { ; CHECK-NEXT: .cfi_offset w21, -24 ; CHECK-NEXT: .cfi_offset w22, -32 ; CHECK-NEXT: .cfi_offset w23, -48 -; CHECK-NEXT: ldr w12, [sp, #112] +; CHECK-NEXT: ldr w13, [sp, #112] ; CHECK-NEXT: ldr w14, [sp, #144] ; CHECK-NEXT: fmov s2, w4 -; CHECK-NEXT: ldr w16, [sp, #176] +; CHECK-NEXT: ldr w17, [sp, #176] ; CHECK-NEXT: ldr w19, [sp, #208] ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: ldr w20, [sp, #80] ; CHECK-NEXT: ldr w21, [sp, #48] -; CHECK-NEXT: fmov s5, w12 +; CHECK-NEXT: fmov s5, w13 ; CHECK-NEXT: fmov s4, w19 -; CHECK-NEXT: fmov s6, w16 +; CHECK-NEXT: fmov s6, w17 ; CHECK-NEXT: fmov s7, w14 ; CHECK-NEXT: fmov s0, w20 ; CHECK-NEXT: fmov s1, w21 ; CHECK-NEXT: ldr w10, [sp, #120] ; CHECK-NEXT: ldr w11, [sp, #152] -; CHECK-NEXT: ldr w13, [sp, #184] +; CHECK-NEXT: ldr w12, [sp, #184] ; CHECK-NEXT: ldr w15, [sp, #216] ; CHECK-NEXT: ldr w22, [sp, #88] ; CHECK-NEXT: ldr w23, [sp, #56] @@ -571,18 +571,18 @@ define <16 x i32> @i12(<16 x i12> %s0, <16 x i12> %s1) { ; CHECK-NEXT: mov v4.h[1], w15 ; CHECK-NEXT: mov v0.h[1], w22 ; CHECK-NEXT: mov v1.h[1], w23 -; CHECK-NEXT: mov v6.h[1], w13 +; CHECK-NEXT: mov v6.h[1], w12 ; CHECK-NEXT: mov v7.h[1], w11 ; CHECK-NEXT: ldr w8, [sp, #128] ; CHECK-NEXT: ldr w9, [sp, #160] -; CHECK-NEXT: ldr w17, [sp, #64] +; CHECK-NEXT: ldr w16, [sp, #64] ; CHECK-NEXT: ldr w18, [sp, #96] ; CHECK-NEXT: ldr w10, [sp, #192] ; CHECK-NEXT: ldr w11, [sp, #224] ; CHECK-NEXT: mov v2.h[2], w6 ; CHECK-NEXT: mov v3.h[2], w2 ; CHECK-NEXT: mov v0.h[2], w18 -; CHECK-NEXT: mov v1.h[2], w17 +; CHECK-NEXT: mov v1.h[2], w16 ; CHECK-NEXT: mov v5.h[2], w8 ; CHECK-NEXT: mov v4.h[2], w11 ; CHECK-NEXT: mov v6.h[2], w10 diff --git a/llvm/test/CodeGen/AArch64/pow.ll b/llvm/test/CodeGen/AArch64/pow.ll index 623429c1085ac..ba9a5398298af 100644 --- a/llvm/test/CodeGen/AArch64/pow.ll +++ b/llvm/test/CodeGen/AArch64/pow.ll @@ -110,17 +110,17 @@ define <2 x double> @pow_v2f64_one_fourth_not_enough_fmf(<2 x double> %x) nounwi ; CHECK-LABEL: pow_v2f64_one_fourth_not_enough_fmf: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: fmov d1, #0.25000000 ; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill ; CHECK-NEXT: bl pow ; CHECK-NEXT: fmov d1, #0.25000000 -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: bl pow -; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: mov v0.d[1], v1.d[0] diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll index 43f40badc1ae2..419f25c22eb72 100644 --- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll +++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll @@ -29,22 +29,24 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-NEXT: .cfi_offset b14, -88 ; CHECK-NEXT: .cfi_offset b15, -96 ; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: // implicit-def: $q6 ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: mov x9, xzr ; CHECK-NEXT: adrp x10, B+48 ; CHECK-NEXT: add x10, x10, :lo12:B+48 ; CHECK-NEXT: adrp x11, A ; CHECK-NEXT: add x11, x11, :lo12:A +; CHECK-NEXT: // kill: killed $q6 ; CHECK-NEXT: // implicit-def: $q6 -; CHECK-NEXT: // implicit-def: $q7 -; CHECK-NEXT: // implicit-def: $q10 +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: // implicit-def: $q2 ; CHECK-NEXT: // implicit-def: $q3 ; CHECK-NEXT: // implicit-def: $q4 ; CHECK-NEXT: // implicit-def: $q5 -; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: // implicit-def: $q7 ; CHECK-NEXT: // implicit-def: $q16 ; CHECK-NEXT: // implicit-def: $q17 -; CHECK-NEXT: // implicit-def: $q18 +; CHECK-NEXT: // implicit-def: $q10 ; CHECK-NEXT: // implicit-def: $q19 ; CHECK-NEXT: // implicit-def: $q20 ; CHECK-NEXT: // implicit-def: $q21 @@ -56,179 +58,170 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-NEXT: // implicit-def: $q26 ; CHECK-NEXT: // implicit-def: $q28 ; CHECK-NEXT: // implicit-def: $q30 -; CHECK-NEXT: // implicit-def: $q15 +; CHECK-NEXT: // implicit-def: $q18 ; CHECK-NEXT: // implicit-def: $q29 ; CHECK-NEXT: // implicit-def: $q31 -; CHECK-NEXT: // implicit-def: $q11 -; CHECK-NEXT: // implicit-def: $q9 -; CHECK-NEXT: // kill: killed $q6 ; CHECK-NEXT: // implicit-def: $q12 ; CHECK-NEXT: // implicit-def: $q13 +; CHECK-NEXT: // implicit-def: $q11 +; CHECK-NEXT: // kill: killed $q6 ; CHECK-NEXT: // implicit-def: $q6 ; CHECK-NEXT: // kill: killed $q6 ; CHECK-NEXT: .LBB0_1: // %for.cond1.preheader ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr q14, [x8] ; CHECK-NEXT: mov x12, xzr +; CHECK-NEXT: str q18, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: ldr x14, [x12] +; CHECK-NEXT: ldr q15, [x12] ; CHECK-NEXT: add x7, x11, x8 -; CHECK-NEXT: ldr x13, [x12] +; CHECK-NEXT: fmov x15, d14 +; CHECK-NEXT: mov x16, v14.d[1] +; CHECK-NEXT: ldr q18, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: fmov x18, d15 +; CHECK-NEXT: mov x13, v15.d[1] ; CHECK-NEXT: ldr x5, [x8] -; CHECK-NEXT: ldr x7, [x7, #128] -; CHECK-NEXT: mov x14, v14.d[1] -; CHECK-NEXT: stp q22, q26, [sp] // 32-byte Folded Spill -; CHECK-NEXT: mov v22.16b, v9.16b -; CHECK-NEXT: stp q31, q15, [sp, #32] // 32-byte Folded Spill -; CHECK-NEXT: ldr q15, [x12] -; CHECK-NEXT: fmov x12, d14 ; CHECK-NEXT: ldr q14, [x10], #64 -; CHECK-NEXT: mov v9.16b, v30.16b -; CHECK-NEXT: fmov x17, d15 -; CHECK-NEXT: mov x16, v15.d[1] -; CHECK-NEXT: mov v30.16b, v27.16b -; CHECK-NEXT: mul x15, x12, x13 -; CHECK-NEXT: mov x0, v14.d[1] +; CHECK-NEXT: ldr x7, [x7, #128] +; CHECK-NEXT: mul x17, x15, x14 +; CHECK-NEXT: mov v6.16b, v0.16b +; CHECK-NEXT: mov v9.16b, v27.16b +; CHECK-NEXT: mov x12, v14.d[1] ; CHECK-NEXT: fmov x4, d14 ; CHECK-NEXT: mov v27.16b, v23.16b +; CHECK-NEXT: mul x1, x16, x14 ; CHECK-NEXT: mov v23.16b, v19.16b -; CHECK-NEXT: mov v19.16b, v2.16b -; CHECK-NEXT: mul x1, x14, x13 -; CHECK-NEXT: mov v8.16b, v28.16b -; CHECK-NEXT: mov v28.16b, v24.16b -; CHECK-NEXT: mov v24.16b, v20.16b +; CHECK-NEXT: mov v19.16b, v7.16b +; CHECK-NEXT: mov v7.16b, v2.16b +; CHECK-NEXT: stp q26, q31, [sp] // 32-byte Folded Spill +; CHECK-NEXT: mov v31.16b, v22.16b +; CHECK-NEXT: mul x0, x18, x14 +; CHECK-NEXT: mov v26.16b, v10.16b +; CHECK-NEXT: mov v22.16b, v5.16b +; CHECK-NEXT: fmov d15, x17 +; CHECK-NEXT: mov v5.16b, v1.16b +; CHECK-NEXT: mov v8.16b, v20.16b +; CHECK-NEXT: mul x2, x13, x14 ; CHECK-NEXT: mov v20.16b, v16.16b ; CHECK-NEXT: mov v16.16b, v3.16b -; CHECK-NEXT: mul x18, x17, x13 -; CHECK-NEXT: mov v31.16b, v18.16b -; CHECK-NEXT: mov v26.16b, v5.16b -; CHECK-NEXT: fmov d15, x15 -; CHECK-NEXT: mov v5.16b, v1.16b -; CHECK-NEXT: mov v18.16b, v10.16b -; CHECK-NEXT: mul x2, x16, x13 -; CHECK-NEXT: mov v10.16b, v29.16b -; CHECK-NEXT: mov v29.16b, v25.16b -; CHECK-NEXT: mov v25.16b, v21.16b +; CHECK-NEXT: mov v10.16b, v21.16b ; CHECK-NEXT: mov v21.16b, v17.16b ; CHECK-NEXT: mov v17.16b, v4.16b ; CHECK-NEXT: mov v15.d[1], x1 -; CHECK-NEXT: mul x19, x12, x5 +; CHECK-NEXT: mul x3, x12, x14 ; CHECK-NEXT: add x8, x8, #8 -; CHECK-NEXT: fmov d14, x18 +; CHECK-NEXT: fmov d14, x0 ; CHECK-NEXT: cmp x8, #64 ; CHECK-NEXT: add x9, x9, #1 -; CHECK-NEXT: mul x12, x12, x7 +; CHECK-NEXT: mul x14, x4, x14 +; CHECK-NEXT: add v18.2d, v18.2d, v15.2d +; CHECK-NEXT: mul x19, x15, x5 ; CHECK-NEXT: mov v14.d[1], x2 -; CHECK-NEXT: add v12.2d, v12.2d, v15.2d -; CHECK-NEXT: mul x3, x0, x13 +; CHECK-NEXT: mul x15, x15, x7 +; CHECK-NEXT: fmov d0, x14 +; CHECK-NEXT: str q18, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: ldp q18, q15, [sp, #32] // 32-byte Folded Reload +; CHECK-NEXT: mul x6, x16, x5 ; CHECK-NEXT: fmov d1, x19 -; CHECK-NEXT: mul x13, x4, x13 -; CHECK-NEXT: fmov d2, x12 -; CHECK-NEXT: mul x6, x14, x5 -; CHECK-NEXT: add v6.2d, v13.2d, v14.2d -; CHECK-NEXT: mov v13.16b, v12.16b -; CHECK-NEXT: ldr q12, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: mul x14, x14, x7 -; CHECK-NEXT: fmov d0, x13 -; CHECK-NEXT: add v12.2d, v12.2d, v14.2d -; CHECK-NEXT: mul x21, x17, x7 -; CHECK-NEXT: mov v1.d[1], x6 -; CHECK-NEXT: mul x18, x4, x7 ; CHECK-NEXT: mov v0.d[1], x3 -; CHECK-NEXT: mov v2.d[1], x14 -; CHECK-NEXT: str q12, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: mov v12.16b, v13.16b -; CHECK-NEXT: mul x13, x17, x5 -; CHECK-NEXT: mov v13.16b, v6.16b +; CHECK-NEXT: mul x16, x16, x7 +; CHECK-NEXT: fmov d2, x15 +; CHECK-NEXT: add v15.2d, v15.2d, v14.2d +; CHECK-NEXT: mul x21, x18, x7 +; CHECK-NEXT: mov v1.d[1], x6 +; CHECK-NEXT: mul x0, x4, x7 +; CHECK-NEXT: str q15, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: add v15.2d, v11.2d, v14.2d +; CHECK-NEXT: mov v2.d[1], x16 +; CHECK-NEXT: ldr q11, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: mul x20, x13, x7 ; CHECK-NEXT: fmov d3, x21 -; CHECK-NEXT: ldp q15, q6, [sp, #48] // 32-byte Folded Reload -; CHECK-NEXT: mul x20, x16, x7 -; CHECK-NEXT: add v11.2d, v11.2d, v1.2d -; CHECK-NEXT: fmov d4, x18 -; CHECK-NEXT: mul x22, x0, x7 -; CHECK-NEXT: add v6.2d, v6.2d, v0.2d -; CHECK-NEXT: add v15.2d, v15.2d, v2.2d -; CHECK-NEXT: fmov d14, x13 -; CHECK-NEXT: mov v2.16b, v19.16b +; CHECK-NEXT: add v11.2d, v11.2d, v0.2d +; CHECK-NEXT: add v12.2d, v12.2d, v1.2d +; CHECK-NEXT: mul x22, x12, x7 +; CHECK-NEXT: fmov d4, x0 +; CHECK-NEXT: add v18.2d, v18.2d, v2.2d +; CHECK-NEXT: mov v2.16b, v7.16b +; CHECK-NEXT: mul x14, x18, x5 +; CHECK-NEXT: mov v7.16b, v19.16b ; CHECK-NEXT: mov v19.16b, v23.16b -; CHECK-NEXT: mul x14, x4, x5 -; CHECK-NEXT: mov v23.16b, v27.16b -; CHECK-NEXT: mov v27.16b, v30.16b ; CHECK-NEXT: mov v3.d[1], x20 -; CHECK-NEXT: mov v30.16b, v9.16b -; CHECK-NEXT: mov v9.16b, v22.16b -; CHECK-NEXT: mul x12, x16, x5 -; CHECK-NEXT: str q6, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: mov v6.16b, v18.16b +; CHECK-NEXT: mov v23.16b, v27.16b +; CHECK-NEXT: mov v27.16b, v9.16b +; CHECK-NEXT: mul x15, x4, x5 +; CHECK-NEXT: add v27.2d, v9.2d, v1.2d +; CHECK-NEXT: str q11, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov v4.d[1], x22 -; CHECK-NEXT: add v27.2d, v27.2d, v1.2d -; CHECK-NEXT: add v23.2d, v23.2d, v1.2d -; CHECK-NEXT: mul x13, x0, x5 ; CHECK-NEXT: add v19.2d, v19.2d, v1.2d -; CHECK-NEXT: add v2.2d, v2.2d, v1.2d -; CHECK-NEXT: fmov d0, x14 +; CHECK-NEXT: add v7.2d, v7.2d, v1.2d +; CHECK-NEXT: mul x13, x13, x5 +; CHECK-NEXT: add v23.2d, v23.2d, v1.2d +; CHECK-NEXT: add v1.2d, v5.2d, v1.2d +; CHECK-NEXT: fmov d14, x14 ; CHECK-NEXT: add v30.2d, v30.2d, v3.2d ; CHECK-NEXT: mov v3.16b, v16.16b +; CHECK-NEXT: mul x12, x12, x5 ; CHECK-NEXT: mov v16.16b, v20.16b -; CHECK-NEXT: mov v20.16b, v24.16b -; CHECK-NEXT: mov v24.16b, v28.16b -; CHECK-NEXT: mov v14.d[1], x12 -; CHECK-NEXT: mov v28.16b, v8.16b -; CHECK-NEXT: add v1.2d, v5.2d, v1.2d -; CHECK-NEXT: add v28.2d, v8.2d, v4.2d +; CHECK-NEXT: mov v5.16b, v22.16b +; CHECK-NEXT: fmov d0, x15 +; CHECK-NEXT: add v28.2d, v28.2d, v4.2d ; CHECK-NEXT: mov v4.16b, v17.16b ; CHECK-NEXT: mov v17.16b, v21.16b -; CHECK-NEXT: mov v0.d[1], x13 -; CHECK-NEXT: mov v21.16b, v25.16b -; CHECK-NEXT: mov v25.16b, v29.16b -; CHECK-NEXT: mov v29.16b, v10.16b -; CHECK-NEXT: mov v5.16b, v26.16b -; CHECK-NEXT: mov v18.16b, v31.16b -; CHECK-NEXT: ldp q22, q26, [sp] // 32-byte Folded Reload -; CHECK-NEXT: ldr q31, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: add v9.2d, v9.2d, v14.2d -; CHECK-NEXT: add v24.2d, v24.2d, v14.2d -; CHECK-NEXT: add v20.2d, v20.2d, v14.2d +; CHECK-NEXT: mov v21.16b, v10.16b +; CHECK-NEXT: mov v10.16b, v26.16b +; CHECK-NEXT: mov v14.d[1], x13 +; CHECK-NEXT: mov v22.16b, v31.16b +; CHECK-NEXT: mov v20.16b, v8.16b +; CHECK-NEXT: ldp q26, q31, [sp] // 32-byte Folded Reload +; CHECK-NEXT: mov v11.16b, v15.16b +; CHECK-NEXT: mov v0.d[1], x12 +; CHECK-NEXT: add v13.2d, v13.2d, v14.2d ; CHECK-NEXT: add v31.2d, v31.2d, v14.2d -; CHECK-NEXT: add v18.2d, v18.2d, v14.2d -; CHECK-NEXT: add v16.2d, v16.2d, v14.2d ; CHECK-NEXT: add v26.2d, v26.2d, v14.2d +; CHECK-NEXT: add v24.2d, v24.2d, v14.2d ; CHECK-NEXT: add v22.2d, v22.2d, v14.2d +; CHECK-NEXT: add v20.2d, v8.2d, v14.2d +; CHECK-NEXT: add v10.2d, v10.2d, v14.2d +; CHECK-NEXT: add v16.2d, v16.2d, v14.2d ; CHECK-NEXT: add v5.2d, v5.2d, v14.2d ; CHECK-NEXT: add v3.2d, v3.2d, v14.2d -; CHECK-NEXT: add v10.2d, v6.2d, v14.2d +; CHECK-NEXT: add v2.2d, v2.2d, v14.2d ; CHECK-NEXT: add v29.2d, v29.2d, v0.2d ; CHECK-NEXT: add v25.2d, v25.2d, v0.2d ; CHECK-NEXT: add v21.2d, v21.2d, v0.2d ; CHECK-NEXT: add v17.2d, v17.2d, v0.2d ; CHECK-NEXT: add v4.2d, v4.2d, v0.2d -; CHECK-NEXT: add v7.2d, v7.2d, v0.2d +; CHECK-NEXT: add v0.2d, v6.2d, v0.2d ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup +; CHECK-NEXT: ldr q6, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: adrp x8, C ; CHECK-NEXT: add x8, x8, :lo12:C -; CHECK-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: stp q13, q12, [x8] -; CHECK-NEXT: ldr q6, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: stp q9, q11, [x8, #64] +; CHECK-NEXT: stp q12, q31, [x8, #80] ; CHECK-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload -; CHECK-NEXT: stp q15, q30, [x8, #144] +; CHECK-NEXT: str q6, [x8] +; CHECK-NEXT: ldr q6, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: str q29, [x8, #112] ; CHECK-NEXT: ldp x22, x21, [sp, #160] // 16-byte Folded Reload -; CHECK-NEXT: stp q4, q3, [x8, #432] +; CHECK-NEXT: stp q6, q11, [x8, #16] +; CHECK-NEXT: ldr q6, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: stp q18, q30, [x8, #144] ; CHECK-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload -; CHECK-NEXT: stp q0, q6, [x8, #32] +; CHECK-NEXT: stp q6, q13, [x8, #48] ; CHECK-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload -; CHECK-NEXT: stp q31, q29, [x8, #96] -; CHECK-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload ; CHECK-NEXT: stp q28, q26, [x8, #176] +; CHECK-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: stp q19, q10, [x8, #336] +; CHECK-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload ; CHECK-NEXT: str q27, [x8, #208] ; CHECK-NEXT: stp q25, q24, [x8, #240] ; CHECK-NEXT: stp q23, q22, [x8, #272] ; CHECK-NEXT: stp q21, q20, [x8, #304] -; CHECK-NEXT: stp q19, q18, [x8, #336] ; CHECK-NEXT: stp q17, q16, [x8, #368] -; CHECK-NEXT: stp q2, q5, [x8, #400] -; CHECK-NEXT: stp q1, q10, [x8, #464] -; CHECK-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload -; CHECK-NEXT: str q7, [x8, #496] +; CHECK-NEXT: stp q7, q5, [x8, #400] +; CHECK-NEXT: stp q4, q3, [x8, #432] +; CHECK-NEXT: stp q1, q2, [x8, #464] +; CHECK-NEXT: str q0, [x8, #496] ; CHECK-NEXT: add sp, sp, #192 ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore w19 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index 5f7a22ed055c8..70987df1c9c04 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -333,21 +333,21 @@ define void @masked_scatter_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: punpklo p2.h, p0.b -; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p1.b ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 -; VBITS_GE_256-NEXT: and p0.b, p2/z, p2.b, p1.b +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z4.d] -; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: and p1.b, p2/z, p2.b, p0.b +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 +; VBITS_GE_256-NEXT: st1w { z2.d }, p1, [z4.d] ; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z3.d] ; VBITS_GE_256-NEXT: ret ; @@ -711,21 +711,21 @@ define void @masked_scatter_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1] -; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p2.h, p1.b ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: punpklo p2.h, p0.b -; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 -; VBITS_GE_256-NEXT: and p0.b, p2/z, p2.b, p1.b ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z4.d] -; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0 +; VBITS_GE_256-NEXT: and p1.b, p2/z, p2.b, p0.b +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 +; VBITS_GE_256-NEXT: st1w { z2.d }, p1, [z4.d] ; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z3.d] ; VBITS_GE_256-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll index 6e29f7cbabcc8..a5303c901b80f 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -33,75 +33,75 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK-NEXT: umov w8, v0.b[8] ; CHECK-NEXT: umov w9, v0.b[9] ; CHECK-NEXT: umov w10, v0.b[1] -; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: umov w11, v0.b[15] -; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fmov s2, w8 ; CHECK-NEXT: umov w8, v0.b[10] -; CHECK-NEXT: mov v2.b[1], w10 +; CHECK-NEXT: mov v1.b[1], w10 ; CHECK-NEXT: umov w10, v0.b[11] -; CHECK-NEXT: mov v1.b[1], w9 +; CHECK-NEXT: mov v2.b[1], w9 ; CHECK-NEXT: umov w9, v0.b[2] -; CHECK-NEXT: mov v1.b[2], w8 +; CHECK-NEXT: mov v2.b[2], w8 ; CHECK-NEXT: umov w8, v0.b[3] -; CHECK-NEXT: mov v2.b[2], w9 +; CHECK-NEXT: mov v1.b[2], w9 ; CHECK-NEXT: umov w9, v0.b[12] -; CHECK-NEXT: mov v1.b[3], w10 +; CHECK-NEXT: mov v2.b[3], w10 ; CHECK-NEXT: umov w10, v0.b[4] -; CHECK-NEXT: mov v2.b[3], w8 +; CHECK-NEXT: mov v1.b[3], w8 ; CHECK-NEXT: umov w8, v0.b[13] -; CHECK-NEXT: mov v1.b[4], w9 +; CHECK-NEXT: mov v2.b[4], w9 ; CHECK-NEXT: umov w9, v0.b[5] -; CHECK-NEXT: mov v2.b[4], w10 +; CHECK-NEXT: mov v1.b[4], w10 ; CHECK-NEXT: umov w10, v0.b[14] -; CHECK-NEXT: mov v1.b[5], w8 +; CHECK-NEXT: mov v2.b[5], w8 ; CHECK-NEXT: umov w8, v0.b[6] -; CHECK-NEXT: mov v2.b[5], w9 +; CHECK-NEXT: mov v1.b[5], w9 ; CHECK-NEXT: umov w9, v0.b[7] ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 -; CHECK-NEXT: mov v1.b[6], w10 -; CHECK-NEXT: mov v2.b[6], w8 +; CHECK-NEXT: mov v2.b[6], w10 +; CHECK-NEXT: mov v1.b[6], w8 ; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: mov x10, #8 // =0x8 ; CHECK-NEXT: ld1w { z4.s }, p0/z, [x0, x8, lsl #2] -; CHECK-NEXT: mov v1.b[7], w11 -; CHECK-NEXT: mov v2.b[7], w9 +; CHECK-NEXT: mov v2.b[7], w11 +; CHECK-NEXT: mov v1.b[7], w9 ; CHECK-NEXT: uunpklo z3.h, z3.b ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: mov x9, #24 // =0x18 -; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: lsl z0.s, z0.s, #31 -; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: lsl z3.s, z3.s, #31 ; CHECK-NEXT: asr z0.s, z0.s, #31 ; CHECK-NEXT: asr z3.s, z3.s, #31 -; CHECK-NEXT: lsl z1.s, z1.s, #31 ; CHECK-NEXT: lsl z2.s, z2.s, #31 +; CHECK-NEXT: lsl z1.s, z1.s, #31 ; CHECK-NEXT: and z0.s, z0.s, #0x1 ; CHECK-NEXT: and z3.s, z3.s, #0x1 -; CHECK-NEXT: asr z1.s, z1.s, #31 ; CHECK-NEXT: asr z2.s, z2.s, #31 +; CHECK-NEXT: asr z1.s, z1.s, #31 ; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] ; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0 ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] -; CHECK-NEXT: and z1.s, z1.s, #0x1 ; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: and z1.s, z1.s, #0x1 ; CHECK-NEXT: mov z4.s, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z0.s, p2/m, #0 // =0x0 -; CHECK-NEXT: cmpne p3.s, p0/z, z1.s, #0 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: cmpne p1.s, p0/z, z2.s, #0 +; CHECK-NEXT: cmpne p3.s, p0/z, z2.s, #0 +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] +; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0 ; CHECK-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2] ; CHECK-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] ; CHECK-NEXT: mov z3.s, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z1.s, p1/m, #0 // =0x0 +; CHECK-NEXT: mov z2.s, p1/m, #0 // =0x0 ; CHECK-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2] -; CHECK-NEXT: st1w { z1.s }, p0, [x0] +; CHECK-NEXT: st1w { z2.s }, p0, [x0] ; CHECK-NEXT: .LBB1_2: // %exit ; CHECK-NEXT: ret %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll index 756e25f8e3368..d6adf9cf0ad67 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -215,44 +215,44 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #128 ; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: ldp q0, q4, [x0] -; CHECK-NEXT: mov z1.h, z0.h[1] -; CHECK-NEXT: mov z2.h, z0.h[3] -; CHECK-NEXT: mov z3.h, z0.h[2] -; CHECK-NEXT: fcvtzu x8, h0 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: mov z5.h, z4.h[3] -; CHECK-NEXT: fcvtzu x10, h4 -; CHECK-NEXT: fcvtzu x9, h1 -; CHECK-NEXT: fcvtzu x11, h2 -; CHECK-NEXT: fcvtzu x12, h3 -; CHECK-NEXT: mov z1.h, z0.h[1] -; CHECK-NEXT: mov z3.h, z0.h[3] -; CHECK-NEXT: fcvtzu x13, h0 -; CHECK-NEXT: mov z0.h, z0.h[2] -; CHECK-NEXT: mov z2.h, z4.h[1] -; CHECK-NEXT: stp x8, x9, [sp, #32] +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: mov z3.h, z1.h[3] +; CHECK-NEXT: mov z4.h, z1.h[2] ; CHECK-NEXT: fcvtzu x8, h1 -; CHECK-NEXT: fcvtzu x9, h3 +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: fcvtzu x10, h0 +; CHECK-NEXT: fcvtzu x9, h2 +; CHECK-NEXT: fcvtzu x11, h3 +; CHECK-NEXT: fcvtzu x12, h4 +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: mov z4.h, z1.h[3] +; CHECK-NEXT: fcvtzu x13, h1 +; CHECK-NEXT: mov z1.h, z1.h[2] +; CHECK-NEXT: mov z3.h, z0.h[1] +; CHECK-NEXT: stp x8, x9, [sp, #32] +; CHECK-NEXT: fcvtzu x8, h2 +; CHECK-NEXT: fcvtzu x9, h4 ; CHECK-NEXT: stp x12, x11, [sp, #48] -; CHECK-NEXT: fcvtzu x11, h0 -; CHECK-NEXT: mov z1.h, z4.h[2] -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 -; CHECK-NEXT: fcvtzu x12, h2 +; CHECK-NEXT: fcvtzu x11, h1 +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: fcvtzu x12, h3 ; CHECK-NEXT: stp x13, x8, [sp] ; CHECK-NEXT: fcvtzu x8, h5 ; CHECK-NEXT: stp x11, x9, [sp, #16] -; CHECK-NEXT: fcvtzu x9, h1 -; CHECK-NEXT: mov z0.h, z4.h[1] -; CHECK-NEXT: mov z1.h, z4.h[3] -; CHECK-NEXT: mov z2.h, z4.h[2] -; CHECK-NEXT: fcvtzu x11, h4 +; CHECK-NEXT: fcvtzu x9, h2 +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: fcvtzu x11, h0 +; CHECK-NEXT: mov z0.h, z0.h[2] ; CHECK-NEXT: stp x10, x12, [sp, #96] ; CHECK-NEXT: ldp q3, q4, [sp] -; CHECK-NEXT: fcvtzu x10, h0 -; CHECK-NEXT: fcvtzu x12, h1 +; CHECK-NEXT: fcvtzu x10, h1 +; CHECK-NEXT: fcvtzu x12, h2 ; CHECK-NEXT: stp x9, x8, [sp, #112] -; CHECK-NEXT: fcvtzu x8, h2 +; CHECK-NEXT: fcvtzu x8, h0 ; CHECK-NEXT: ldp q0, q1, [sp, #32] ; CHECK-NEXT: ldp q6, q7, [sp, #96] ; CHECK-NEXT: stp x11, x10, [sp, #64] @@ -965,44 +965,44 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #128 ; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: ldp q0, q4, [x0] -; CHECK-NEXT: mov z1.h, z0.h[1] -; CHECK-NEXT: mov z2.h, z0.h[3] -; CHECK-NEXT: mov z3.h, z0.h[2] -; CHECK-NEXT: fcvtzs x8, h0 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: mov z5.h, z4.h[3] -; CHECK-NEXT: fcvtzs x10, h4 -; CHECK-NEXT: fcvtzs x9, h1 -; CHECK-NEXT: fcvtzs x11, h2 -; CHECK-NEXT: fcvtzs x12, h3 -; CHECK-NEXT: mov z1.h, z0.h[1] -; CHECK-NEXT: mov z3.h, z0.h[3] -; CHECK-NEXT: fcvtzs x13, h0 -; CHECK-NEXT: mov z0.h, z0.h[2] -; CHECK-NEXT: mov z2.h, z4.h[1] -; CHECK-NEXT: stp x8, x9, [sp, #32] +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: mov z3.h, z1.h[3] +; CHECK-NEXT: mov z4.h, z1.h[2] ; CHECK-NEXT: fcvtzs x8, h1 -; CHECK-NEXT: fcvtzs x9, h3 +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: fcvtzs x10, h0 +; CHECK-NEXT: fcvtzs x9, h2 +; CHECK-NEXT: fcvtzs x11, h3 +; CHECK-NEXT: fcvtzs x12, h4 +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: mov z4.h, z1.h[3] +; CHECK-NEXT: fcvtzs x13, h1 +; CHECK-NEXT: mov z1.h, z1.h[2] +; CHECK-NEXT: mov z3.h, z0.h[1] +; CHECK-NEXT: stp x8, x9, [sp, #32] +; CHECK-NEXT: fcvtzs x8, h2 +; CHECK-NEXT: fcvtzs x9, h4 ; CHECK-NEXT: stp x12, x11, [sp, #48] -; CHECK-NEXT: fcvtzs x11, h0 -; CHECK-NEXT: mov z1.h, z4.h[2] -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 -; CHECK-NEXT: fcvtzs x12, h2 +; CHECK-NEXT: fcvtzs x11, h1 +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: fcvtzs x12, h3 ; CHECK-NEXT: stp x13, x8, [sp] ; CHECK-NEXT: fcvtzs x8, h5 ; CHECK-NEXT: stp x11, x9, [sp, #16] -; CHECK-NEXT: fcvtzs x9, h1 -; CHECK-NEXT: mov z0.h, z4.h[1] -; CHECK-NEXT: mov z1.h, z4.h[3] -; CHECK-NEXT: mov z2.h, z4.h[2] -; CHECK-NEXT: fcvtzs x11, h4 +; CHECK-NEXT: fcvtzs x9, h2 +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: fcvtzs x11, h0 +; CHECK-NEXT: mov z0.h, z0.h[2] ; CHECK-NEXT: stp x10, x12, [sp, #96] ; CHECK-NEXT: ldp q3, q4, [sp] -; CHECK-NEXT: fcvtzs x10, h0 -; CHECK-NEXT: fcvtzs x12, h1 +; CHECK-NEXT: fcvtzs x10, h1 +; CHECK-NEXT: fcvtzs x12, h2 ; CHECK-NEXT: stp x9, x8, [sp, #112] -; CHECK-NEXT: fcvtzs x8, h2 +; CHECK-NEXT: fcvtzs x8, h0 ; CHECK-NEXT: ldp q0, q1, [sp, #32] ; CHECK-NEXT: ldp q6, q7, [sp, #96] ; CHECK-NEXT: stp x11, x10, [sp, #64] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll index 63a4226655a88..5e3ce0ddebe2e 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -112,11 +112,10 @@ define void @sdiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: mov z16.d, z6.d ; CHECK-NEXT: mov z0.d, z3.d ; CHECK-NEXT: ext z1.b, z1.b, z2.b, #8 -; CHECK-NEXT: sunpklo z2.h, z2.b ; CHECK-NEXT: ext z16.b, z16.b, z6.b, #8 +; CHECK-NEXT: sunpklo z6.h, z6.b ; CHECK-NEXT: ext z0.b, z0.b, z3.b, #8 ; CHECK-NEXT: sunpklo z3.h, z3.b -; CHECK-NEXT: sunpklo z6.h, z6.b ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z16.h, z16.b ; CHECK-NEXT: sunpklo z4.h, z0.b @@ -130,15 +129,16 @@ define void @sdiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sunpklo z4.s, z4.h ; CHECK-NEXT: sunpklo z16.s, z16.h ; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z5.s -; CHECK-NEXT: sunpklo z5.s, z2.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z4.s -; CHECK-NEXT: sunpklo z4.s, z3.h +; CHECK-NEXT: sunpklo z4.h, z2.b +; CHECK-NEXT: sunpklo z2.s, z3.h ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpklo z5.s, z4.h +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z5.s ; CHECK-NEXT: mov z5.d, z7.d ; CHECK-NEXT: ext z5.b, z5.b, z7.b, #8 ; CHECK-NEXT: sunpklo z7.h, z7.b @@ -146,37 +146,37 @@ define void @sdiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sunpklo z5.h, z5.b ; CHECK-NEXT: sunpklo z17.s, z5.h ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 -; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: sunpklo z5.s, z5.h -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s ; CHECK-NEXT: sunpklo z18.s, z6.h ; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 ; CHECK-NEXT: sunpklo z6.s, z6.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z16.s ; CHECK-NEXT: sunpklo z16.s, z7.h ; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 ; CHECK-NEXT: sunpklo z7.s, z7.h -; CHECK-NEXT: uzp1 z3.h, z17.h, z17.h +; CHECK-NEXT: uzp1 z4.h, z17.h, z17.h ; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z18.s ; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h ; CHECK-NEXT: sdiv z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h -; CHECK-NEXT: splice z3.h, p0, z3.h, z5.h +; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h -; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b +; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h +; CHECK-NEXT: uzp1 z1.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: uzp1 z3.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z2.b, z7.b, z7.b -; CHECK-NEXT: splice z3.b, p0, z3.b, z0.b -; CHECK-NEXT: splice z2.b, p0, z2.b, z1.b -; CHECK-NEXT: stp q2, q3, [x0] +; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b +; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b +; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b +; CHECK-NEXT: stp q3, q2, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -479,11 +479,10 @@ define void @udiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: mov z16.d, z6.d ; CHECK-NEXT: mov z0.d, z3.d ; CHECK-NEXT: ext z1.b, z1.b, z2.b, #8 -; CHECK-NEXT: uunpklo z2.h, z2.b ; CHECK-NEXT: ext z16.b, z16.b, z6.b, #8 +; CHECK-NEXT: uunpklo z6.h, z6.b ; CHECK-NEXT: ext z0.b, z0.b, z3.b, #8 ; CHECK-NEXT: uunpklo z3.h, z3.b -; CHECK-NEXT: uunpklo z6.h, z6.b ; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z16.h, z16.b ; CHECK-NEXT: uunpklo z4.h, z0.b @@ -497,15 +496,16 @@ define void @udiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z4.s, z4.h ; CHECK-NEXT: uunpklo z16.s, z16.h ; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z5.s -; CHECK-NEXT: uunpklo z5.s, z2.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z4.s -; CHECK-NEXT: uunpklo z4.s, z3.h +; CHECK-NEXT: uunpklo z4.h, z2.b +; CHECK-NEXT: uunpklo z2.s, z3.h ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpklo z5.s, z4.h +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z5.s ; CHECK-NEXT: mov z5.d, z7.d ; CHECK-NEXT: ext z5.b, z5.b, z7.b, #8 ; CHECK-NEXT: uunpklo z7.h, z7.b @@ -513,37 +513,37 @@ define void @udiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z5.h, z5.b ; CHECK-NEXT: uunpklo z17.s, z5.h ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 -; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: uunpklo z5.s, z5.h -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s ; CHECK-NEXT: uunpklo z18.s, z6.h ; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 ; CHECK-NEXT: uunpklo z6.s, z6.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z16.s ; CHECK-NEXT: uunpklo z16.s, z7.h ; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 ; CHECK-NEXT: uunpklo z7.s, z7.h -; CHECK-NEXT: uzp1 z3.h, z17.h, z17.h +; CHECK-NEXT: uzp1 z4.h, z17.h, z17.h ; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z18.s ; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h ; CHECK-NEXT: udiv z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h -; CHECK-NEXT: splice z3.h, p0, z3.h, z5.h +; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h -; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b +; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h +; CHECK-NEXT: uzp1 z1.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: uzp1 z3.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z2.b, z7.b, z7.b -; CHECK-NEXT: splice z3.b, p0, z3.b, z0.b -; CHECK-NEXT: splice z2.b, p0, z2.b, z1.b -; CHECK-NEXT: stp q2, q3, [x0] +; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b +; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b +; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b +; CHECK-NEXT: stp q3, q2, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll index 2c3303d5d3407..eb95a410209b4 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -115,19 +115,19 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d -; CHECK-NEXT: sunpklo z6.h, z1.b -; CHECK-NEXT: sunpklo z7.h, z0.b +; CHECK-NEXT: sunpklo z7.h, z1.b +; CHECK-NEXT: sunpklo z16.h, z0.b ; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: sunpklo z16.s, z6.h -; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 -; CHECK-NEXT: sunpklo z17.s, z7.h +; CHECK-NEXT: sunpklo z6.s, z7.h ; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 +; CHECK-NEXT: sunpklo z17.s, z16.h +; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 ; CHECK-NEXT: sunpklo z4.h, z2.b ; CHECK-NEXT: sunpklo z3.h, z3.b -; CHECK-NEXT: sunpklo z6.s, z6.h ; CHECK-NEXT: sunpklo z7.s, z7.h -; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z17.s +; CHECK-NEXT: sunpklo z16.s, z16.h +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z17.s ; CHECK-NEXT: sunpklo z2.s, z4.h ; CHECK-NEXT: sunpklo z5.s, z3.h ; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 @@ -135,34 +135,35 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sunpklo z4.s, z4.h ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z5.s -; CHECK-NEXT: ldr q5, [x1] -; CHECK-NEXT: mov z17.d, z5.d -; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h -; CHECK-NEXT: ext z17.b, z17.b, z5.b, #8 +; CHECK-NEXT: movprfx z5, z3 +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z4.s +; CHECK-NEXT: ldr q3, [x0] +; CHECK-NEXT: ldr q4, [x1] +; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: mov z18.d, z3.d +; CHECK-NEXT: mov z17.d, z4.d +; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: ext z18.b, z18.b, z3.b, #8 +; CHECK-NEXT: ext z17.b, z17.b, z4.b, #8 +; CHECK-NEXT: sunpklo z18.h, z18.b ; CHECK-NEXT: sunpklo z17.h, z17.b -; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s -; CHECK-NEXT: ldr q4, [x0] +; CHECK-NEXT: sunpklo z20.s, z18.h +; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 ; CHECK-NEXT: sunpklo z19.s, z17.h ; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 -; CHECK-NEXT: mov z18.d, z4.d +; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: sunpklo z18.s, z18.h ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sunpklo z17.s, z17.h -; CHECK-NEXT: ext z18.b, z18.b, z4.b, #8 -; CHECK-NEXT: sunpklo z18.h, z18.b -; CHECK-NEXT: sunpklo z20.s, z18.h -; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 -; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: sunpklo z18.s, z18.h -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: sdivr z19.s, p0/m, z19.s, z20.s -; CHECK-NEXT: sunpklo z20.h, z4.b -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: sunpklo z20.h, z3.b +; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h ; CHECK-NEXT: sunpklo z22.s, z20.h ; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8 ; CHECK-NEXT: sunpklo z20.s, z20.h ; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s -; CHECK-NEXT: sunpklo z18.h, z5.b -; CHECK-NEXT: uzp1 z7.h, z19.h, z19.h +; CHECK-NEXT: sunpklo z18.h, z4.b +; CHECK-NEXT: uzp1 z16.h, z19.h, z19.h ; CHECK-NEXT: sunpklo z21.s, z18.h ; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 ; CHECK-NEXT: sunpklo z18.s, z18.h @@ -171,21 +172,21 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sdivr z18.s, p0/m, z18.s, z20.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h -; CHECK-NEXT: splice z7.h, p0, z7.h, z17.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h -; CHECK-NEXT: splice z16.h, p0, z16.h, z6.h -; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b +; CHECK-NEXT: splice z2.h, p0, z2.h, z5.h +; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h +; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: uzp1 z7.b, z16.b, z16.b +; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b +; CHECK-NEXT: uzp1 z5.b, z16.b, z16.b ; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h ; CHECK-NEXT: splice z19.h, p0, z19.h, z18.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z6.b, z19.b, z19.b -; CHECK-NEXT: splice z7.b, p0, z7.b, z2.b -; CHECK-NEXT: splice z6.b, p0, z6.b, z3.b -; CHECK-NEXT: movprfx z2, z4 -; CHECK-NEXT: mls z2.b, p1/m, z6.b, z5.b -; CHECK-NEXT: mls z0.b, p1/m, z7.b, z1.b +; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b +; CHECK-NEXT: splice z6.b, p0, z6.b, z2.b +; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b +; CHECK-NEXT: movprfx z2, z3 +; CHECK-NEXT: mls z2.b, p1/m, z7.b, z4.b +; CHECK-NEXT: mls z0.b, p1/m, z6.b, z1.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a @@ -503,19 +504,19 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d -; CHECK-NEXT: uunpklo z6.h, z1.b -; CHECK-NEXT: uunpklo z7.h, z0.b +; CHECK-NEXT: uunpklo z7.h, z1.b +; CHECK-NEXT: uunpklo z16.h, z0.b ; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: uunpklo z16.s, z6.h -; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 -; CHECK-NEXT: uunpklo z17.s, z7.h +; CHECK-NEXT: uunpklo z6.s, z7.h ; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 +; CHECK-NEXT: uunpklo z17.s, z16.h +; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 ; CHECK-NEXT: uunpklo z4.h, z2.b ; CHECK-NEXT: uunpklo z3.h, z3.b -; CHECK-NEXT: uunpklo z6.s, z6.h ; CHECK-NEXT: uunpklo z7.s, z7.h -; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z17.s +; CHECK-NEXT: uunpklo z16.s, z16.h +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z17.s ; CHECK-NEXT: uunpklo z2.s, z4.h ; CHECK-NEXT: uunpklo z5.s, z3.h ; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 @@ -523,34 +524,35 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z4.s, z4.h ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z5.s -; CHECK-NEXT: ldr q5, [x1] -; CHECK-NEXT: mov z17.d, z5.d -; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h -; CHECK-NEXT: ext z17.b, z17.b, z5.b, #8 +; CHECK-NEXT: movprfx z5, z3 +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z4.s +; CHECK-NEXT: ldr q3, [x0] +; CHECK-NEXT: ldr q4, [x1] +; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: mov z18.d, z3.d +; CHECK-NEXT: mov z17.d, z4.d +; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: ext z18.b, z18.b, z3.b, #8 +; CHECK-NEXT: ext z17.b, z17.b, z4.b, #8 +; CHECK-NEXT: uunpklo z18.h, z18.b ; CHECK-NEXT: uunpklo z17.h, z17.b -; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s -; CHECK-NEXT: ldr q4, [x0] +; CHECK-NEXT: uunpklo z20.s, z18.h +; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 ; CHECK-NEXT: uunpklo z19.s, z17.h ; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 -; CHECK-NEXT: mov z18.d, z4.d +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: uunpklo z18.s, z18.h ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uunpklo z17.s, z17.h -; CHECK-NEXT: ext z18.b, z18.b, z4.b, #8 -; CHECK-NEXT: uunpklo z18.h, z18.b -; CHECK-NEXT: uunpklo z20.s, z18.h -; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 -; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: uunpklo z18.s, z18.h -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: udivr z19.s, p0/m, z19.s, z20.s -; CHECK-NEXT: uunpklo z20.h, z4.b -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: uunpklo z20.h, z3.b +; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h ; CHECK-NEXT: uunpklo z22.s, z20.h ; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8 ; CHECK-NEXT: uunpklo z20.s, z20.h ; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s -; CHECK-NEXT: uunpklo z18.h, z5.b -; CHECK-NEXT: uzp1 z7.h, z19.h, z19.h +; CHECK-NEXT: uunpklo z18.h, z4.b +; CHECK-NEXT: uzp1 z16.h, z19.h, z19.h ; CHECK-NEXT: uunpklo z21.s, z18.h ; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 ; CHECK-NEXT: uunpklo z18.s, z18.h @@ -559,21 +561,21 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: udivr z18.s, p0/m, z18.s, z20.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h -; CHECK-NEXT: splice z7.h, p0, z7.h, z17.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h -; CHECK-NEXT: splice z16.h, p0, z16.h, z6.h -; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b +; CHECK-NEXT: splice z2.h, p0, z2.h, z5.h +; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h +; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: uzp1 z7.b, z16.b, z16.b +; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b +; CHECK-NEXT: uzp1 z5.b, z16.b, z16.b ; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h ; CHECK-NEXT: splice z19.h, p0, z19.h, z18.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z6.b, z19.b, z19.b -; CHECK-NEXT: splice z7.b, p0, z7.b, z2.b -; CHECK-NEXT: splice z6.b, p0, z6.b, z3.b -; CHECK-NEXT: movprfx z2, z4 -; CHECK-NEXT: mls z2.b, p1/m, z6.b, z5.b -; CHECK-NEXT: mls z0.b, p1/m, z7.b, z1.b +; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b +; CHECK-NEXT: splice z6.b, p0, z6.b, z2.b +; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b +; CHECK-NEXT: movprfx z2, z3 +; CHECK-NEXT: mls z2.b, p1/m, z7.b, z4.b +; CHECK-NEXT: mls z0.b, p1/m, z6.b, z1.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll index 40bc43791c45a..1f036fa08ef15 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll @@ -92,12 +92,13 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z16.h, z3.h[5] ; CHECK-NEXT: fmov w9, s17 ; CHECK-NEXT: mov z17.h, z4.h[5] +; CHECK-NEXT: mov z20.h, z7.h[6] ; CHECK-NEXT: strh w8, [sp, #30] ; CHECK-NEXT: fmov w8, s18 ; CHECK-NEXT: mov z18.h, z3.h[4] ; CHECK-NEXT: strh w9, [sp, #28] ; CHECK-NEXT: fmov w9, s19 -; CHECK-NEXT: mov z19.h, z7.h[6] +; CHECK-NEXT: mov z19.h, z6.h[7] ; CHECK-NEXT: zip1 z3.h, z4.h, z3.h ; CHECK-NEXT: strh w8, [sp, #26] ; CHECK-NEXT: fmov w8, s16 @@ -133,66 +134,65 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: zip1 z1.h, z2.h, z5.h ; CHECK-NEXT: strh w8, [sp, #54] ; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z7.h[7] +; CHECK-NEXT: ldr q16, [sp, #16] ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: strh w8, [sp, #52] ; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: mov z17.h, z6.h[7] ; CHECK-NEXT: strh w8, [sp, #50] ; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: ldr q18, [sp, #16] +; CHECK-NEXT: mov z18.h, z7.h[7] ; CHECK-NEXT: strh w8, [sp, #48] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z6.h[6] -; CHECK-NEXT: ldr q20, [sp, #48] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z18.h, z6.h[6] +; CHECK-NEXT: ldr q17, [sp, #48] ; CHECK-NEXT: strh w8, [sp, #46] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: mov z17.h, z7.h[5] -; CHECK-NEXT: strh w8, [sp, #44] ; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z6.h[5] +; CHECK-NEXT: mov z19.h, z7.h[5] +; CHECK-NEXT: strh w8, [sp, #44] +; CHECK-NEXT: fmov w8, s20 +; CHECK-NEXT: mov z20.h, z6.h[5] ; CHECK-NEXT: strh w8, [sp, #42] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z7.h[4] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z18.h, z7.h[4] ; CHECK-NEXT: strh w8, [sp, #40] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: mov z17.h, z6.h[4] -; CHECK-NEXT: strh w8, [sp, #38] ; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z5.h[7] +; CHECK-NEXT: mov z19.h, z6.h[4] +; CHECK-NEXT: strh w8, [sp, #38] +; CHECK-NEXT: fmov w8, s20 +; CHECK-NEXT: mov z20.h, z5.h[7] ; CHECK-NEXT: strh w8, [sp, #36] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z2.h[7] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z18.h, z2.h[7] ; CHECK-NEXT: strh w8, [sp, #34] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: mov z17.h, z5.h[6] -; CHECK-NEXT: strh w8, [sp, #32] ; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z2.h[6] +; CHECK-NEXT: mov z19.h, z5.h[6] +; CHECK-NEXT: strh w8, [sp, #32] +; CHECK-NEXT: fmov w8, s20 +; CHECK-NEXT: mov z20.h, z2.h[6] ; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z5.h[5] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z18.h, z5.h[5] ; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: mov z17.h, z2.h[5] -; CHECK-NEXT: strh w8, [sp, #10] ; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z5.h[4] -; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: mov z19.h, z2.h[5] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: fmov w8, s20 +; CHECK-NEXT: mov z20.h, z5.h[4] +; CHECK-NEXT: fmov w9, s19 ; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z2.h[4] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z18.h, z2.h[4] ; CHECK-NEXT: strh w9, [sp, #4] ; CHECK-NEXT: ldr q2, [sp, #32] ; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: add z2.h, z18.h, z2.h +; CHECK-NEXT: fmov w8, s20 +; CHECK-NEXT: add z2.h, z16.h, z2.h ; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: fmov w8, s18 ; CHECK-NEXT: strh w8, [sp] ; CHECK-NEXT: ldr q4, [sp] ; CHECK-NEXT: stp q3, q2, [x0, #32] -; CHECK-NEXT: add z1.h, z20.h, z4.h +; CHECK-NEXT: add z1.h, z17.h, z4.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret @@ -956,36 +956,39 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{ ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #64 ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: ldp q1, q3, [x0] -; CHECK-NEXT: ldp q0, q2, [x1] +; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q0, q1, [x1] ; CHECK-NEXT: mov z4.h, z3.h[6] ; CHECK-NEXT: fmov w8, s3 ; CHECK-NEXT: mov z6.h, z3.h[2] ; CHECK-NEXT: mov z5.h, z3.h[4] -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: mov z7.h, z1.h[6] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z7.h, z2.h[6] +; CHECK-NEXT: mov z17.h, z2.h[7] +; CHECK-NEXT: mov z16.h, z3.h[1] ; CHECK-NEXT: strh w8, [sp, #40] ; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z1.h[4] +; CHECK-NEXT: mov z4.h, z2.h[4] ; CHECK-NEXT: strh w9, [sp, #32] ; CHECK-NEXT: fmov w9, s5 -; CHECK-NEXT: mov z5.h, z1.h[2] +; CHECK-NEXT: mov z5.h, z2.h[2] ; CHECK-NEXT: strh w8, [sp, #46] ; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.h, z2.h[2] +; CHECK-NEXT: mov z6.h, z1.h[2] ; CHECK-NEXT: strh w9, [sp, #44] ; CHECK-NEXT: fmov w9, s7 ; CHECK-NEXT: mov z7.h, z0.h[6] ; CHECK-NEXT: strh w8, [sp, #42] ; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z2.h[6] +; CHECK-NEXT: mov z4.h, z1.h[6] ; CHECK-NEXT: strh w9, [sp, #38] +; CHECK-NEXT: fmov w9, s16 ; CHECK-NEXT: strh w8, [sp, #36] ; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z2.h[4] +; CHECK-NEXT: mov z5.h, z1.h[4] +; CHECK-NEXT: strh w9, [sp, #56] ; CHECK-NEXT: strh w8, [sp, #34] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: ldr q16, [sp, #32] +; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: strh w8, [sp, #8] ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: strh w8, [sp] @@ -999,63 +1002,60 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{ ; CHECK-NEXT: mov z6.h, z3.h[7] ; CHECK-NEXT: strh w8, [sp, #10] ; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.h, z1.h[7] +; CHECK-NEXT: mov z7.h, z3.h[5] ; CHECK-NEXT: strh w8, [sp, #6] ; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z3.h[5] ; CHECK-NEXT: strh w8, [sp, #4] ; CHECK-NEXT: fmov w8, s5 ; CHECK-NEXT: mov z5.h, z3.h[3] -; CHECK-NEXT: mov z3.h, z3.h[1] +; CHECK-NEXT: ldr q3, [sp, #32] ; CHECK-NEXT: strh w8, [sp, #2] ; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov z3.h, z2.h[7] -; CHECK-NEXT: ldr q6, [sp] +; CHECK-NEXT: mov z6.h, z2.h[5] +; CHECK-NEXT: ldr q4, [sp] ; CHECK-NEXT: strh w8, [sp, #62] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z1.h[5] -; CHECK-NEXT: strh w9, [sp, #56] +; CHECK-NEXT: fmov w8, s7 +; CHECK-NEXT: mov z7.h, z1.h[7] ; CHECK-NEXT: strh w8, [sp, #60] ; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z1.h[3] -; CHECK-NEXT: mov z1.h, z1.h[1] +; CHECK-NEXT: mov z5.h, z2.h[3] +; CHECK-NEXT: mov z2.h, z2.h[1] ; CHECK-NEXT: strh w8, [sp, #58] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: mov z1.h, z2.h[1] +; CHECK-NEXT: fmov w8, s17 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.h, z0.h[7] ; CHECK-NEXT: strh w8, [sp, #54] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z2.h[5] +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: mov z6.h, z1.h[5] ; CHECK-NEXT: strh w9, [sp, #48] ; CHECK-NEXT: strh w8, [sp, #52] ; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z2.h[3] -; CHECK-NEXT: mov z2.h, z0.h[7] +; CHECK-NEXT: mov z5.h, z1.h[3] +; CHECK-NEXT: mov z1.h, z1.h[1] ; CHECK-NEXT: strh w8, [sp, #50] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.h, z0.h[5] +; CHECK-NEXT: fmov w8, s7 ; CHECK-NEXT: strh w8, [sp, #30] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z0.h[3] -; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: mov z6.h, z0.h[5] ; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: mov z0.h, z0.h[1] ; CHECK-NEXT: strh w8, [sp, #26] ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: strh w8, [sp, #24] ; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: strh w8, [sp, #22] -; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: fmov w8, s6 ; CHECK-NEXT: strh w8, [sp, #20] -; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: fmov w8, s5 ; CHECK-NEXT: strh w8, [sp, #18] ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: ldr q0, [sp, #48] -; CHECK-NEXT: add z0.h, z16.h, z0.h +; CHECK-NEXT: add z0.h, z3.h, z0.h ; CHECK-NEXT: strh w8, [sp, #16] ; CHECK-NEXT: ldr q1, [sp, #16] -; CHECK-NEXT: add z1.h, z6.h, z1.h +; CHECK-NEXT: add z1.h, z4.h, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret @@ -1133,45 +1133,45 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{ ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ldr q0, [x1] -; CHECK-NEXT: ldr q1, [x0] -; CHECK-NEXT: mov z2.h, z0.h[6] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z4.h, z0.h[2] -; CHECK-NEXT: mov z6.h, z1.h[4] -; CHECK-NEXT: mov z3.h, z0.h[4] -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: mov z5.h, z1.h[6] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: mov z2.h, z1.h[6] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z4.h, z1.h[2] +; CHECK-NEXT: mov z6.h, z0.h[4] +; CHECK-NEXT: mov z3.h, z1.h[4] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov z5.h, z0.h[6] ; CHECK-NEXT: strh w8, [sp, #8] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.h, z1.h[2] +; CHECK-NEXT: mov z2.h, z0.h[2] ; CHECK-NEXT: strh w9, [sp] ; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov z3.h, z0.h[7] +; CHECK-NEXT: mov z3.h, z1.h[7] ; CHECK-NEXT: strh w8, [sp, #14] ; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z0.h[5] +; CHECK-NEXT: mov z4.h, z1.h[5] ; CHECK-NEXT: strh w9, [sp, #12] ; CHECK-NEXT: fmov w9, s5 -; CHECK-NEXT: mov z5.h, z0.h[3] -; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: mov z5.h, z1.h[3] +; CHECK-NEXT: mov z1.h, z1.h[1] ; CHECK-NEXT: strh w8, [sp, #10] ; CHECK-NEXT: fmov w8, s6 ; CHECK-NEXT: strh w9, [sp, #6] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov z0.h, z1.h[1] +; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: strh w8, [sp, #4] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.h, z1.h[7] +; CHECK-NEXT: mov z2.h, z0.h[7] ; CHECK-NEXT: strh w9, [sp, #24] ; CHECK-NEXT: strh w8, [sp, #2] ; CHECK-NEXT: fmov w8, s3 ; CHECK-NEXT: strh w8, [sp, #30] ; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z1.h[5] +; CHECK-NEXT: mov z4.h, z0.h[5] ; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z1.h[3] +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: mov z0.h, z0.h[1] ; CHECK-NEXT: strh w8, [sp, #26] ; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: strh w8, [sp, #22] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index 7623e35654162..61439021a8875 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -5828,29 +5828,28 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg ; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v1, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s9, 6, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v7, v0 :: v_dual_cndmask_b32 v5, v8, v1 +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v0 :: v_dual_cndmask_b32 v6, v8, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v0, s6 ; GFX11-NEXT: v_readfirstlane_b32 s0, v18 ; GFX11-NEXT: v_readfirstlane_b32 s1, v17 ; GFX11-NEXT: v_readfirstlane_b32 s2, v3 +; GFX11-NEXT: v_readfirstlane_b32 s3, v4 ; GFX11-NEXT: v_dual_cndmask_b32 v7, v9, v0 :: v_dual_cndmask_b32 v8, v10, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v11, v0, s6 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v12, v1, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v15, v0, s9 -; GFX11-NEXT: v_readfirstlane_b32 s3, v4 -; GFX11-NEXT: v_readfirstlane_b32 s4, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v13, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v14, v14, v1, vcc_lo +; GFX11-NEXT: v_readfirstlane_b32 s4, v5 +; GFX11-NEXT: v_readfirstlane_b32 s5, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v11, v13, v0 :: v_dual_cndmask_b32 v12, v14, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v14, v15, v0, s9 ; GFX11-NEXT: v_cndmask_b32_e64 v13, v16, v1, s9 -; GFX11-NEXT: v_readfirstlane_b32 s5, v5 ; GFX11-NEXT: v_readfirstlane_b32 s6, v7 ; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: v_readfirstlane_b32 s8, v11 +; GFX11-NEXT: v_readfirstlane_b32 s8, v10 ; GFX11-NEXT: v_readfirstlane_b32 s9, v9 -; GFX11-NEXT: v_readfirstlane_b32 s10, v10 -; GFX11-NEXT: v_readfirstlane_b32 s11, v14 -; GFX11-NEXT: v_readfirstlane_b32 s12, v12 +; GFX11-NEXT: v_readfirstlane_b32 s10, v11 +; GFX11-NEXT: v_readfirstlane_b32 s11, v12 +; GFX11-NEXT: v_readfirstlane_b32 s12, v14 ; GFX11-NEXT: v_readfirstlane_b32 s13, v13 ; GFX11-NEXT: ; return to shader part epilog entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 0255a77aa0ffd..eb3f74be71de0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -838,165 +838,165 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX7-NEXT: v_readfirstlane_b32 s21, v2 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_readfirstlane_b32 s23, v1 -; GFX7-NEXT: v_readfirstlane_b32 s19, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mul_hi_u32 v1, v0, s8 ; GFX7-NEXT: v_mul_hi_u32 v3, v2, s8 -; GFX7-NEXT: v_mov_b32_e32 v4, s11 ; GFX7-NEXT: s_mul_i32 s18, s16, s10 -; GFX7-NEXT: v_readfirstlane_b32 s24, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s12 -; GFX7-NEXT: v_readfirstlane_b32 s22, v3 -; GFX7-NEXT: v_mul_hi_u32 v3, s16, v1 ; GFX7-NEXT: s_mul_i32 s20, s1, s9 -; GFX7-NEXT: v_mul_hi_u32 v5, s1, v4 +; GFX7-NEXT: v_readfirstlane_b32 s19, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: s_add_u32 s18, s20, s18 -; GFX7-NEXT: v_readfirstlane_b32 s25, v3 -; GFX7-NEXT: v_mul_hi_u32 v3, v2, s10 ; GFX7-NEXT: s_addc_u32 s19, s21, s19 ; GFX7-NEXT: s_mul_i32 s21, s2, s8 +; GFX7-NEXT: v_readfirstlane_b32 s23, v1 +; GFX7-NEXT: v_mul_hi_u32 v1, v0, s8 ; GFX7-NEXT: s_cselect_b32 s20, 1, 0 +; GFX7-NEXT: v_readfirstlane_b32 s22, v3 ; GFX7-NEXT: s_add_u32 s18, s21, s18 -; GFX7-NEXT: v_readfirstlane_b32 s28, v3 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_addc_u32 s19, s22, s19 ; GFX7-NEXT: s_mul_i32 s22, s16, s9 -; GFX7-NEXT: v_readfirstlane_b32 s27, v5 -; GFX7-NEXT: v_mul_hi_u32 v5, v3, s9 ; GFX7-NEXT: s_cselect_b32 s21, 1, 0 ; GFX7-NEXT: s_add_u32 s17, s22, s17 -; GFX7-NEXT: s_addc_u32 s18, s23, s18 -; GFX7-NEXT: s_mul_i32 s23, s1, s8 -; GFX7-NEXT: s_cselect_b32 s22, 1, 0 -; GFX7-NEXT: s_add_u32 s17, s23, s17 -; GFX7-NEXT: s_addc_u32 s18, s24, s18 -; GFX7-NEXT: s_mul_i32 s24, s16, s12 -; GFX7-NEXT: s_mul_i32 s26, s1, s11 +; GFX7-NEXT: s_addc_u32 s22, s23, s18 +; GFX7-NEXT: v_readfirstlane_b32 s23, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s12 +; GFX7-NEXT: v_mul_hi_u32 v3, s16, v1 +; GFX7-NEXT: s_mul_i32 s18, s1, s8 +; GFX7-NEXT: s_cselect_b32 s25, 1, 0 +; GFX7-NEXT: s_add_u32 s18, s18, s17 +; GFX7-NEXT: s_addc_u32 s17, s23, s22 +; GFX7-NEXT: v_mov_b32_e32 v4, s11 +; GFX7-NEXT: v_readfirstlane_b32 s23, v3 +; GFX7-NEXT: v_mul_hi_u32 v3, v2, s10 +; GFX7-NEXT: v_mul_hi_u32 v5, s1, v4 +; GFX7-NEXT: s_mul_i32 s22, s16, s12 +; GFX7-NEXT: s_mul_i32 s24, s1, s11 +; GFX7-NEXT: v_readfirstlane_b32 s28, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_readfirstlane_b32 s27, v5 +; GFX7-NEXT: v_mul_hi_u32 v5, v3, s9 +; GFX7-NEXT: s_cselect_b32 s26, 1, 0 +; GFX7-NEXT: s_add_u32 s24, s24, s22 +; GFX7-NEXT: s_addc_u32 s23, s27, s23 ; GFX7-NEXT: v_readfirstlane_b32 s29, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: s_cselect_b32 s23, 1, 0 -; GFX7-NEXT: s_add_u32 s24, s26, s24 ; GFX7-NEXT: v_mul_hi_u32 v6, v5, s8 -; GFX7-NEXT: s_addc_u32 s25, s27, s25 ; GFX7-NEXT: s_mul_i32 s27, s2, s10 -; GFX7-NEXT: s_cselect_b32 s26, 1, 0 +; GFX7-NEXT: s_cselect_b32 s22, 1, 0 ; GFX7-NEXT: s_add_u32 s24, s27, s24 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, s10 -; GFX7-NEXT: s_addc_u32 s25, s28, s25 +; GFX7-NEXT: s_addc_u32 s27, s28, s23 ; GFX7-NEXT: s_mul_i32 s28, s3, s9 -; GFX7-NEXT: s_cselect_b32 s27, 1, 0 -; GFX7-NEXT: s_add_u32 s24, s28, s24 +; GFX7-NEXT: s_cselect_b32 s23, 1, 0 +; GFX7-NEXT: s_add_u32 s28, s28, s24 ; GFX7-NEXT: v_readfirstlane_b32 s30, v6 ; GFX7-NEXT: v_mul_hi_u32 v6, s16, v4 -; GFX7-NEXT: s_addc_u32 s25, s29, s25 +; GFX7-NEXT: s_addc_u32 s27, s29, s27 ; GFX7-NEXT: s_mul_i32 s29, s4, s8 -; GFX7-NEXT: s_cselect_b32 s28, 1, 0 -; GFX7-NEXT: s_add_u32 s24, s29, s24 +; GFX7-NEXT: s_cselect_b32 s24, 1, 0 +; GFX7-NEXT: s_add_u32 s28, s29, s28 ; GFX7-NEXT: v_readfirstlane_b32 s33, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v2, s9 -; GFX7-NEXT: s_addc_u32 s25, s30, s25 +; GFX7-NEXT: s_addc_u32 s27, s30, s27 ; GFX7-NEXT: s_mul_i32 s30, s16, s11 ; GFX7-NEXT: s_cselect_b32 s29, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s31, v6 ; GFX7-NEXT: s_add_u32 s19, s30, s19 -; GFX7-NEXT: s_addc_u32 s24, s31, s24 +; GFX7-NEXT: s_addc_u32 s28, s31, s28 ; GFX7-NEXT: s_mul_i32 s31, s1, s10 ; GFX7-NEXT: s_cselect_b32 s30, 1, 0 ; GFX7-NEXT: s_add_u32 s19, s31, s19 ; GFX7-NEXT: v_readfirstlane_b32 s34, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v3, s8 -; GFX7-NEXT: s_addc_u32 s24, s33, s24 +; GFX7-NEXT: s_addc_u32 s28, s33, s28 ; GFX7-NEXT: s_mul_i32 s33, s2, s9 ; GFX7-NEXT: s_cselect_b32 s31, 1, 0 ; GFX7-NEXT: s_add_u32 s19, s33, s19 -; GFX7-NEXT: s_addc_u32 s24, s34, s24 +; GFX7-NEXT: s_addc_u32 s28, s34, s28 ; GFX7-NEXT: s_mul_i32 s34, s3, s8 ; GFX7-NEXT: s_cselect_b32 s33, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: s_add_u32 s19, s34, s19 ; GFX7-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-NEXT: s_addc_u32 s24, s35, s24 +; GFX7-NEXT: s_addc_u32 s28, s35, s28 ; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX7-NEXT: s_cselect_b32 s34, 1, 0 -; GFX7-NEXT: s_cmp_lg_u32 s23, 0 -; GFX7-NEXT: s_addc_u32 s19, s22, s19 +; GFX7-NEXT: s_cmp_lg_u32 s26, 0 +; GFX7-NEXT: s_addc_u32 s19, s25, s19 ; GFX7-NEXT: v_mov_b32_e32 v2, s13 -; GFX7-NEXT: s_cselect_b32 s22, 1, 0 +; GFX7-NEXT: s_cselect_b32 s25, 1, 0 ; GFX7-NEXT: s_cmp_lg_u32 s21, 0 ; GFX7-NEXT: v_mul_hi_u32 v6, s1, v2 ; GFX7-NEXT: s_addc_u32 s20, s20, 0 -; GFX7-NEXT: v_readfirstlane_b32 s23, v0 +; GFX7-NEXT: v_readfirstlane_b32 s26, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, s2, v1 -; GFX7-NEXT: s_cmp_lg_u32 s22, 0 -; GFX7-NEXT: s_addc_u32 s20, s20, s24 -; GFX7-NEXT: s_mul_i32 s22, s16, s14 -; GFX7-NEXT: s_mul_i32 s24, s1, s13 +; GFX7-NEXT: s_cmp_lg_u32 s25, 0 +; GFX7-NEXT: s_addc_u32 s20, s20, s28 +; GFX7-NEXT: s_mul_i32 s25, s16, s14 +; GFX7-NEXT: s_mul_i32 s28, s1, s13 ; GFX7-NEXT: s_cselect_b32 s21, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6 -; GFX7-NEXT: s_add_u32 s22, s24, s22 -; GFX7-NEXT: s_addc_u32 s23, s35, s23 +; GFX7-NEXT: s_add_u32 s25, s28, s25 +; GFX7-NEXT: s_addc_u32 s26, s35, s26 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v3, s11 -; GFX7-NEXT: s_mul_i32 s24, s2, s12 -; GFX7-NEXT: s_add_u32 s22, s24, s22 -; GFX7-NEXT: s_addc_u32 s23, s35, s23 +; GFX7-NEXT: s_mul_i32 s28, s2, s12 +; GFX7-NEXT: s_add_u32 s25, s28, s25 +; GFX7-NEXT: s_addc_u32 s26, s35, s26 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v5, s10 -; GFX7-NEXT: s_mul_i32 s24, s3, s11 -; GFX7-NEXT: s_add_u32 s22, s24, s22 -; GFX7-NEXT: s_addc_u32 s23, s35, s23 +; GFX7-NEXT: s_mul_i32 s28, s3, s11 +; GFX7-NEXT: s_add_u32 s25, s28, s25 +; GFX7-NEXT: s_addc_u32 s26, s35, s26 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mul_hi_u32 v6, v0, s9 -; GFX7-NEXT: s_mul_i32 s24, s4, s10 -; GFX7-NEXT: s_add_u32 s22, s24, s22 +; GFX7-NEXT: s_mul_i32 s28, s4, s10 +; GFX7-NEXT: s_add_u32 s25, s28, s25 ; GFX7-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX7-NEXT: s_addc_u32 s23, s35, s23 +; GFX7-NEXT: s_addc_u32 s26, s35, s26 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6 ; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: v_mul_hi_u32 v6, v6, s8 -; GFX7-NEXT: s_mul_i32 s24, s5, s9 -; GFX7-NEXT: s_add_u32 s22, s24, s22 +; GFX7-NEXT: s_mul_i32 s28, s5, s9 +; GFX7-NEXT: s_add_u32 s25, s28, s25 ; GFX7-NEXT: v_mul_hi_u32 v2, s16, v2 ; GFX7-NEXT: v_readfirstlane_b32 s36, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, s2, v4 -; GFX7-NEXT: s_addc_u32 s23, s35, s23 -; GFX7-NEXT: s_mul_i32 s24, s6, s8 +; GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_mul_i32 s28, s6, s8 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6 -; GFX7-NEXT: s_add_u32 s22, s24, s22 -; GFX7-NEXT: s_addc_u32 s23, s35, s23 -; GFX7-NEXT: s_mul_i32 s24, s16, s13 +; GFX7-NEXT: s_add_u32 s25, s28, s25 +; GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_mul_i32 s28, s16, s13 ; GFX7-NEXT: v_readfirstlane_b32 s35, v2 -; GFX7-NEXT: s_add_u32 s24, s24, s25 +; GFX7-NEXT: s_add_u32 s27, s28, s27 ; GFX7-NEXT: v_readfirstlane_b32 s37, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, v3, s10 -; GFX7-NEXT: s_addc_u32 s22, s35, s22 +; GFX7-NEXT: s_addc_u32 s25, s35, s25 ; GFX7-NEXT: s_mul_i32 s35, s1, s12 -; GFX7-NEXT: s_cselect_b32 s25, 1, 0 -; GFX7-NEXT: s_add_u32 s24, s35, s24 -; GFX7-NEXT: s_addc_u32 s22, s36, s22 +; GFX7-NEXT: s_cselect_b32 s28, 1, 0 +; GFX7-NEXT: s_add_u32 s27, s35, s27 +; GFX7-NEXT: s_addc_u32 s25, s36, s25 ; GFX7-NEXT: s_mul_i32 s36, s2, s11 ; GFX7-NEXT: s_cselect_b32 s35, 1, 0 -; GFX7-NEXT: s_add_u32 s24, s36, s24 +; GFX7-NEXT: s_add_u32 s27, s36, s27 ; GFX7-NEXT: v_readfirstlane_b32 s38, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, v5, s9 -; GFX7-NEXT: s_addc_u32 s22, s37, s22 +; GFX7-NEXT: s_addc_u32 s25, s37, s25 ; GFX7-NEXT: s_mul_i32 s37, s3, s10 ; GFX7-NEXT: s_cselect_b32 s36, 1, 0 -; GFX7-NEXT: s_add_u32 s24, s37, s24 +; GFX7-NEXT: s_add_u32 s27, s37, s27 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, s8 -; GFX7-NEXT: s_addc_u32 s22, s38, s22 +; GFX7-NEXT: s_addc_u32 s25, s38, s25 ; GFX7-NEXT: s_mul_i32 s38, s4, s9 ; GFX7-NEXT: s_cselect_b32 s37, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s39, v1 -; GFX7-NEXT: s_add_u32 s24, s38, s24 -; GFX7-NEXT: s_addc_u32 s22, s39, s22 +; GFX7-NEXT: s_add_u32 s27, s38, s27 +; GFX7-NEXT: s_addc_u32 s25, s39, s25 ; GFX7-NEXT: s_mul_i32 s39, s5, s8 ; GFX7-NEXT: s_cselect_b32 s38, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s40, v0 -; GFX7-NEXT: s_add_u32 s24, s39, s24 -; GFX7-NEXT: s_addc_u32 s22, s40, s22 +; GFX7-NEXT: s_add_u32 s27, s39, s27 +; GFX7-NEXT: s_addc_u32 s25, s40, s25 ; GFX7-NEXT: s_cselect_b32 s39, 1, 0 ; GFX7-NEXT: s_cmp_lg_u32 s31, 0 ; GFX7-NEXT: s_addc_u32 s30, s30, 0 @@ -1005,18 +1005,18 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: s_cmp_lg_u32 s34, 0 ; GFX7-NEXT: s_addc_u32 s30, s30, 0 ; GFX7-NEXT: s_cmp_lg_u32 s21, 0 -; GFX7-NEXT: s_addc_u32 s21, s30, s24 -; GFX7-NEXT: s_cselect_b32 s24, 1, 0 -; GFX7-NEXT: s_cmp_lg_u32 s27, 0 -; GFX7-NEXT: s_addc_u32 s26, s26, 0 -; GFX7-NEXT: s_cmp_lg_u32 s28, 0 -; GFX7-NEXT: s_addc_u32 s26, s26, 0 -; GFX7-NEXT: s_cmp_lg_u32 s29, 0 -; GFX7-NEXT: s_addc_u32 s26, s26, 0 +; GFX7-NEXT: s_addc_u32 s21, s30, s27 +; GFX7-NEXT: s_cselect_b32 s27, 1, 0 +; GFX7-NEXT: s_cmp_lg_u32 s23, 0 +; GFX7-NEXT: s_addc_u32 s22, s22, 0 ; GFX7-NEXT: s_cmp_lg_u32 s24, 0 -; GFX7-NEXT: s_addc_u32 s22, s26, s22 +; GFX7-NEXT: s_addc_u32 s22, s22, 0 +; GFX7-NEXT: s_cmp_lg_u32 s29, 0 +; GFX7-NEXT: s_addc_u32 s22, s22, 0 +; GFX7-NEXT: s_cmp_lg_u32 s27, 0 +; GFX7-NEXT: s_addc_u32 s22, s22, s25 ; GFX7-NEXT: s_mul_i32 s16, s16, s15 -; GFX7-NEXT: s_addc_u32 s15, s23, s16 +; GFX7-NEXT: s_addc_u32 s15, s26, s16 ; GFX7-NEXT: s_mul_i32 s1, s1, s14 ; GFX7-NEXT: s_cmp_lg_u32 s39, 0 ; GFX7-NEXT: s_addc_u32 s1, s15, s1 @@ -1033,13 +1033,13 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: s_cmp_lg_u32 s35, 0 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_mul_i32 s6, s6, s9 -; GFX7-NEXT: s_cmp_lg_u32 s25, 0 +; GFX7-NEXT: s_cmp_lg_u32 s28, 0 ; GFX7-NEXT: s_addc_u32 s1, s1, s6 ; GFX7-NEXT: s_mul_i32 s7, s7, s8 ; GFX7-NEXT: s_mul_i32 s0, s0, s8 ; GFX7-NEXT: s_add_u32 s7, s7, s1 -; GFX7-NEXT: s_mov_b32 s1, s17 -; GFX7-NEXT: s_mov_b32 s2, s18 +; GFX7-NEXT: s_mov_b32 s1, s18 +; GFX7-NEXT: s_mov_b32 s2, s17 ; GFX7-NEXT: s_mov_b32 s3, s19 ; GFX7-NEXT: s_mov_b32 s4, s20 ; GFX7-NEXT: s_mov_b32 s5, s21 @@ -1059,165 +1059,165 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX8-NEXT: v_readfirstlane_b32 s21, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_readfirstlane_b32 s23, v1 -; GFX8-NEXT: v_readfirstlane_b32 s19, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_mul_hi_u32 v1, v0, s8 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, s8 -; GFX8-NEXT: v_mov_b32_e32 v4, s11 ; GFX8-NEXT: s_mul_i32 s18, s16, s10 -; GFX8-NEXT: v_readfirstlane_b32 s24, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, s12 -; GFX8-NEXT: v_readfirstlane_b32 s22, v3 -; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1 ; GFX8-NEXT: s_mul_i32 s20, s1, s9 -; GFX8-NEXT: v_mul_hi_u32 v5, s1, v4 +; GFX8-NEXT: v_readfirstlane_b32 s19, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: s_add_u32 s18, s20, s18 -; GFX8-NEXT: v_readfirstlane_b32 s25, v3 -; GFX8-NEXT: v_mul_hi_u32 v3, v2, s10 ; GFX8-NEXT: s_addc_u32 s19, s21, s19 ; GFX8-NEXT: s_mul_i32 s21, s2, s8 +; GFX8-NEXT: v_readfirstlane_b32 s23, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, v0, s8 ; GFX8-NEXT: s_cselect_b32 s20, 1, 0 +; GFX8-NEXT: v_readfirstlane_b32 s22, v3 ; GFX8-NEXT: s_add_u32 s18, s21, s18 -; GFX8-NEXT: v_readfirstlane_b32 s28, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_addc_u32 s19, s22, s19 ; GFX8-NEXT: s_mul_i32 s22, s16, s9 -; GFX8-NEXT: v_readfirstlane_b32 s27, v5 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, s9 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0 ; GFX8-NEXT: s_add_u32 s17, s22, s17 -; GFX8-NEXT: s_addc_u32 s18, s23, s18 -; GFX8-NEXT: s_mul_i32 s23, s1, s8 -; GFX8-NEXT: s_cselect_b32 s22, 1, 0 -; GFX8-NEXT: s_add_u32 s17, s23, s17 -; GFX8-NEXT: s_addc_u32 s18, s24, s18 -; GFX8-NEXT: s_mul_i32 s24, s16, s12 -; GFX8-NEXT: s_mul_i32 s26, s1, s11 +; GFX8-NEXT: s_addc_u32 s22, s23, s18 +; GFX8-NEXT: v_readfirstlane_b32 s23, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1 +; GFX8-NEXT: s_mul_i32 s18, s1, s8 +; GFX8-NEXT: s_cselect_b32 s25, 1, 0 +; GFX8-NEXT: s_add_u32 s18, s18, s17 +; GFX8-NEXT: s_addc_u32 s17, s23, s22 +; GFX8-NEXT: v_mov_b32_e32 v4, s11 +; GFX8-NEXT: v_readfirstlane_b32 s23, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, v2, s10 +; GFX8-NEXT: v_mul_hi_u32 v5, s1, v4 +; GFX8-NEXT: s_mul_i32 s22, s16, s12 +; GFX8-NEXT: s_mul_i32 s24, s1, s11 +; GFX8-NEXT: v_readfirstlane_b32 s28, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_readfirstlane_b32 s27, v5 +; GFX8-NEXT: v_mul_hi_u32 v5, v3, s9 +; GFX8-NEXT: s_cselect_b32 s26, 1, 0 +; GFX8-NEXT: s_add_u32 s24, s24, s22 +; GFX8-NEXT: s_addc_u32 s23, s27, s23 ; GFX8-NEXT: v_readfirstlane_b32 s29, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: s_cselect_b32 s23, 1, 0 -; GFX8-NEXT: s_add_u32 s24, s26, s24 ; GFX8-NEXT: v_mul_hi_u32 v6, v5, s8 -; GFX8-NEXT: s_addc_u32 s25, s27, s25 ; GFX8-NEXT: s_mul_i32 s27, s2, s10 -; GFX8-NEXT: s_cselect_b32 s26, 1, 0 +; GFX8-NEXT: s_cselect_b32 s22, 1, 0 ; GFX8-NEXT: s_add_u32 s24, s27, s24 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, s10 -; GFX8-NEXT: s_addc_u32 s25, s28, s25 +; GFX8-NEXT: s_addc_u32 s27, s28, s23 ; GFX8-NEXT: s_mul_i32 s28, s3, s9 -; GFX8-NEXT: s_cselect_b32 s27, 1, 0 -; GFX8-NEXT: s_add_u32 s24, s28, s24 +; GFX8-NEXT: s_cselect_b32 s23, 1, 0 +; GFX8-NEXT: s_add_u32 s28, s28, s24 ; GFX8-NEXT: v_readfirstlane_b32 s30, v6 ; GFX8-NEXT: v_mul_hi_u32 v6, s16, v4 -; GFX8-NEXT: s_addc_u32 s25, s29, s25 +; GFX8-NEXT: s_addc_u32 s27, s29, s27 ; GFX8-NEXT: s_mul_i32 s29, s4, s8 -; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_add_u32 s24, s29, s24 +; GFX8-NEXT: s_cselect_b32 s24, 1, 0 +; GFX8-NEXT: s_add_u32 s28, s29, s28 ; GFX8-NEXT: v_readfirstlane_b32 s33, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v2, s9 -; GFX8-NEXT: s_addc_u32 s25, s30, s25 +; GFX8-NEXT: s_addc_u32 s27, s30, s27 ; GFX8-NEXT: s_mul_i32 s30, s16, s11 ; GFX8-NEXT: s_cselect_b32 s29, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s31, v6 ; GFX8-NEXT: s_add_u32 s19, s30, s19 -; GFX8-NEXT: s_addc_u32 s24, s31, s24 +; GFX8-NEXT: s_addc_u32 s28, s31, s28 ; GFX8-NEXT: s_mul_i32 s31, s1, s10 ; GFX8-NEXT: s_cselect_b32 s30, 1, 0 ; GFX8-NEXT: s_add_u32 s19, s31, s19 ; GFX8-NEXT: v_readfirstlane_b32 s34, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v3, s8 -; GFX8-NEXT: s_addc_u32 s24, s33, s24 +; GFX8-NEXT: s_addc_u32 s28, s33, s28 ; GFX8-NEXT: s_mul_i32 s33, s2, s9 ; GFX8-NEXT: s_cselect_b32 s31, 1, 0 ; GFX8-NEXT: s_add_u32 s19, s33, s19 -; GFX8-NEXT: s_addc_u32 s24, s34, s24 +; GFX8-NEXT: s_addc_u32 s28, s34, s28 ; GFX8-NEXT: s_mul_i32 s34, s3, s8 ; GFX8-NEXT: s_cselect_b32 s33, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: s_add_u32 s19, s34, s19 ; GFX8-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NEXT: s_addc_u32 s24, s35, s24 +; GFX8-NEXT: s_addc_u32 s28, s35, s28 ; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX8-NEXT: s_cselect_b32 s34, 1, 0 -; GFX8-NEXT: s_cmp_lg_u32 s23, 0 -; GFX8-NEXT: s_addc_u32 s19, s22, s19 +; GFX8-NEXT: s_cmp_lg_u32 s26, 0 +; GFX8-NEXT: s_addc_u32 s19, s25, s19 ; GFX8-NEXT: v_mov_b32_e32 v2, s13 -; GFX8-NEXT: s_cselect_b32 s22, 1, 0 +; GFX8-NEXT: s_cselect_b32 s25, 1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 ; GFX8-NEXT: v_mul_hi_u32 v6, s1, v2 ; GFX8-NEXT: s_addc_u32 s20, s20, 0 -; GFX8-NEXT: v_readfirstlane_b32 s23, v0 +; GFX8-NEXT: v_readfirstlane_b32 s26, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v1 -; GFX8-NEXT: s_cmp_lg_u32 s22, 0 -; GFX8-NEXT: s_addc_u32 s20, s20, s24 -; GFX8-NEXT: s_mul_i32 s22, s16, s14 -; GFX8-NEXT: s_mul_i32 s24, s1, s13 +; GFX8-NEXT: s_cmp_lg_u32 s25, 0 +; GFX8-NEXT: s_addc_u32 s20, s20, s28 +; GFX8-NEXT: s_mul_i32 s25, s16, s14 +; GFX8-NEXT: s_mul_i32 s28, s1, s13 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6 -; GFX8-NEXT: s_add_u32 s22, s24, s22 -; GFX8-NEXT: s_addc_u32 s23, s35, s23 +; GFX8-NEXT: s_add_u32 s25, s28, s25 +; GFX8-NEXT: s_addc_u32 s26, s35, s26 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v3, s11 -; GFX8-NEXT: s_mul_i32 s24, s2, s12 -; GFX8-NEXT: s_add_u32 s22, s24, s22 -; GFX8-NEXT: s_addc_u32 s23, s35, s23 +; GFX8-NEXT: s_mul_i32 s28, s2, s12 +; GFX8-NEXT: s_add_u32 s25, s28, s25 +; GFX8-NEXT: s_addc_u32 s26, s35, s26 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v5, s10 -; GFX8-NEXT: s_mul_i32 s24, s3, s11 -; GFX8-NEXT: s_add_u32 s22, s24, s22 -; GFX8-NEXT: s_addc_u32 s23, s35, s23 +; GFX8-NEXT: s_mul_i32 s28, s3, s11 +; GFX8-NEXT: s_add_u32 s25, s28, s25 +; GFX8-NEXT: s_addc_u32 s26, s35, s26 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mul_hi_u32 v6, v0, s9 -; GFX8-NEXT: s_mul_i32 s24, s4, s10 -; GFX8-NEXT: s_add_u32 s22, s24, s22 +; GFX8-NEXT: s_mul_i32 s28, s4, s10 +; GFX8-NEXT: s_add_u32 s25, s28, s25 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX8-NEXT: s_addc_u32 s23, s35, s23 +; GFX8-NEXT: s_addc_u32 s26, s35, s26 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: v_mul_hi_u32 v6, v6, s8 -; GFX8-NEXT: s_mul_i32 s24, s5, s9 -; GFX8-NEXT: s_add_u32 s22, s24, s22 +; GFX8-NEXT: s_mul_i32 s28, s5, s9 +; GFX8-NEXT: s_add_u32 s25, s28, s25 ; GFX8-NEXT: v_mul_hi_u32 v2, s16, v2 ; GFX8-NEXT: v_readfirstlane_b32 s36, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s2, v4 -; GFX8-NEXT: s_addc_u32 s23, s35, s23 -; GFX8-NEXT: s_mul_i32 s24, s6, s8 +; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_mul_i32 s28, s6, s8 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6 -; GFX8-NEXT: s_add_u32 s22, s24, s22 -; GFX8-NEXT: s_addc_u32 s23, s35, s23 -; GFX8-NEXT: s_mul_i32 s24, s16, s13 +; GFX8-NEXT: s_add_u32 s25, s28, s25 +; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_mul_i32 s28, s16, s13 ; GFX8-NEXT: v_readfirstlane_b32 s35, v2 -; GFX8-NEXT: s_add_u32 s24, s24, s25 +; GFX8-NEXT: s_add_u32 s27, s28, s27 ; GFX8-NEXT: v_readfirstlane_b32 s37, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, v3, s10 -; GFX8-NEXT: s_addc_u32 s22, s35, s22 +; GFX8-NEXT: s_addc_u32 s25, s35, s25 ; GFX8-NEXT: s_mul_i32 s35, s1, s12 -; GFX8-NEXT: s_cselect_b32 s25, 1, 0 -; GFX8-NEXT: s_add_u32 s24, s35, s24 -; GFX8-NEXT: s_addc_u32 s22, s36, s22 +; GFX8-NEXT: s_cselect_b32 s28, 1, 0 +; GFX8-NEXT: s_add_u32 s27, s35, s27 +; GFX8-NEXT: s_addc_u32 s25, s36, s25 ; GFX8-NEXT: s_mul_i32 s36, s2, s11 ; GFX8-NEXT: s_cselect_b32 s35, 1, 0 -; GFX8-NEXT: s_add_u32 s24, s36, s24 +; GFX8-NEXT: s_add_u32 s27, s36, s27 ; GFX8-NEXT: v_readfirstlane_b32 s38, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, v5, s9 -; GFX8-NEXT: s_addc_u32 s22, s37, s22 +; GFX8-NEXT: s_addc_u32 s25, s37, s25 ; GFX8-NEXT: s_mul_i32 s37, s3, s10 ; GFX8-NEXT: s_cselect_b32 s36, 1, 0 -; GFX8-NEXT: s_add_u32 s24, s37, s24 +; GFX8-NEXT: s_add_u32 s27, s37, s27 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, s8 -; GFX8-NEXT: s_addc_u32 s22, s38, s22 +; GFX8-NEXT: s_addc_u32 s25, s38, s25 ; GFX8-NEXT: s_mul_i32 s38, s4, s9 ; GFX8-NEXT: s_cselect_b32 s37, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s39, v1 -; GFX8-NEXT: s_add_u32 s24, s38, s24 -; GFX8-NEXT: s_addc_u32 s22, s39, s22 +; GFX8-NEXT: s_add_u32 s27, s38, s27 +; GFX8-NEXT: s_addc_u32 s25, s39, s25 ; GFX8-NEXT: s_mul_i32 s39, s5, s8 ; GFX8-NEXT: s_cselect_b32 s38, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s40, v0 -; GFX8-NEXT: s_add_u32 s24, s39, s24 -; GFX8-NEXT: s_addc_u32 s22, s40, s22 +; GFX8-NEXT: s_add_u32 s27, s39, s27 +; GFX8-NEXT: s_addc_u32 s25, s40, s25 ; GFX8-NEXT: s_cselect_b32 s39, 1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s31, 0 ; GFX8-NEXT: s_addc_u32 s30, s30, 0 @@ -1226,18 +1226,18 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: s_cmp_lg_u32 s34, 0 ; GFX8-NEXT: s_addc_u32 s30, s30, 0 ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 -; GFX8-NEXT: s_addc_u32 s21, s30, s24 -; GFX8-NEXT: s_cselect_b32 s24, 1, 0 -; GFX8-NEXT: s_cmp_lg_u32 s27, 0 -; GFX8-NEXT: s_addc_u32 s26, s26, 0 -; GFX8-NEXT: s_cmp_lg_u32 s28, 0 -; GFX8-NEXT: s_addc_u32 s26, s26, 0 -; GFX8-NEXT: s_cmp_lg_u32 s29, 0 -; GFX8-NEXT: s_addc_u32 s26, s26, 0 +; GFX8-NEXT: s_addc_u32 s21, s30, s27 +; GFX8-NEXT: s_cselect_b32 s27, 1, 0 +; GFX8-NEXT: s_cmp_lg_u32 s23, 0 +; GFX8-NEXT: s_addc_u32 s22, s22, 0 ; GFX8-NEXT: s_cmp_lg_u32 s24, 0 -; GFX8-NEXT: s_addc_u32 s22, s26, s22 +; GFX8-NEXT: s_addc_u32 s22, s22, 0 +; GFX8-NEXT: s_cmp_lg_u32 s29, 0 +; GFX8-NEXT: s_addc_u32 s22, s22, 0 +; GFX8-NEXT: s_cmp_lg_u32 s27, 0 +; GFX8-NEXT: s_addc_u32 s22, s22, s25 ; GFX8-NEXT: s_mul_i32 s16, s16, s15 -; GFX8-NEXT: s_addc_u32 s15, s23, s16 +; GFX8-NEXT: s_addc_u32 s15, s26, s16 ; GFX8-NEXT: s_mul_i32 s1, s1, s14 ; GFX8-NEXT: s_cmp_lg_u32 s39, 0 ; GFX8-NEXT: s_addc_u32 s1, s15, s1 @@ -1254,13 +1254,13 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: s_cmp_lg_u32 s35, 0 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_mul_i32 s6, s6, s9 -; GFX8-NEXT: s_cmp_lg_u32 s25, 0 +; GFX8-NEXT: s_cmp_lg_u32 s28, 0 ; GFX8-NEXT: s_addc_u32 s1, s1, s6 ; GFX8-NEXT: s_mul_i32 s7, s7, s8 ; GFX8-NEXT: s_mul_i32 s0, s0, s8 ; GFX8-NEXT: s_add_u32 s7, s7, s1 -; GFX8-NEXT: s_mov_b32 s1, s17 -; GFX8-NEXT: s_mov_b32 s2, s18 +; GFX8-NEXT: s_mov_b32 s1, s18 +; GFX8-NEXT: s_mov_b32 s2, s17 ; GFX8-NEXT: s_mov_b32 s3, s19 ; GFX8-NEXT: s_mov_b32 s4, s20 ; GFX8-NEXT: s_mov_b32 s5, s21 @@ -1269,9 +1269,10 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; ; GFX9-LABEL: s_mul_i256: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mul_i32 s18, s0, s10 +; GFX9-NEXT: s_mov_b32 s16, s0 +; GFX9-NEXT: s_mul_i32 s18, s16, s10 ; GFX9-NEXT: s_mul_i32 s20, s1, s9 -; GFX9-NEXT: s_mul_hi_u32 s19, s0, s10 +; GFX9-NEXT: s_mul_hi_u32 s19, s16, s10 ; GFX9-NEXT: s_mul_hi_u32 s21, s1, s9 ; GFX9-NEXT: s_add_u32 s18, s20, s18 ; GFX9-NEXT: s_addc_u32 s19, s21, s19 @@ -1279,11 +1280,11 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_cselect_b32 s20, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s22, s2, s8 ; GFX9-NEXT: s_add_u32 s18, s21, s18 -; GFX9-NEXT: s_mul_hi_u32 s17, s0, s8 +; GFX9-NEXT: s_mul_hi_u32 s17, s16, s8 ; GFX9-NEXT: s_addc_u32 s19, s22, s19 -; GFX9-NEXT: s_mul_i32 s22, s0, s9 +; GFX9-NEXT: s_mul_i32 s22, s16, s9 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_mul_hi_u32 s23, s0, s9 +; GFX9-NEXT: s_mul_hi_u32 s23, s16, s9 ; GFX9-NEXT: s_add_u32 s17, s22, s17 ; GFX9-NEXT: s_addc_u32 s18, s23, s18 ; GFX9-NEXT: s_mul_i32 s23, s1, s8 @@ -1291,10 +1292,10 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_mul_hi_u32 s24, s1, s8 ; GFX9-NEXT: s_add_u32 s17, s23, s17 ; GFX9-NEXT: s_addc_u32 s18, s24, s18 -; GFX9-NEXT: s_mul_i32 s24, s0, s12 +; GFX9-NEXT: s_mul_i32 s24, s16, s12 ; GFX9-NEXT: s_mul_i32 s26, s1, s11 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_mul_hi_u32 s25, s0, s12 +; GFX9-NEXT: s_mul_hi_u32 s25, s16, s12 ; GFX9-NEXT: s_mul_hi_u32 s27, s1, s11 ; GFX9-NEXT: s_add_u32 s24, s26, s24 ; GFX9-NEXT: s_addc_u32 s25, s27, s25 @@ -1313,9 +1314,9 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_mul_hi_u32 s30, s4, s8 ; GFX9-NEXT: s_add_u32 s24, s29, s24 ; GFX9-NEXT: s_addc_u32 s25, s30, s25 -; GFX9-NEXT: s_mul_i32 s30, s0, s11 +; GFX9-NEXT: s_mul_i32 s30, s16, s11 ; GFX9-NEXT: s_cselect_b32 s29, 1, 0 -; GFX9-NEXT: s_mul_hi_u32 s31, s0, s11 +; GFX9-NEXT: s_mul_hi_u32 s31, s16, s11 ; GFX9-NEXT: s_add_u32 s19, s30, s19 ; GFX9-NEXT: s_addc_u32 s24, s31, s24 ; GFX9-NEXT: s_mul_i32 s31, s1, s10 @@ -1341,10 +1342,10 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_addc_u32 s20, s20, 0 ; GFX9-NEXT: s_cmp_lg_u32 s22, 0 ; GFX9-NEXT: s_addc_u32 s20, s20, s24 -; GFX9-NEXT: s_mul_i32 s22, s0, s14 +; GFX9-NEXT: s_mul_i32 s22, s16, s14 ; GFX9-NEXT: s_mul_i32 s24, s1, s13 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_mul_hi_u32 s23, s0, s14 +; GFX9-NEXT: s_mul_hi_u32 s23, s16, s14 ; GFX9-NEXT: s_mul_hi_u32 s35, s1, s13 ; GFX9-NEXT: s_add_u32 s22, s24, s22 ; GFX9-NEXT: s_addc_u32 s23, s35, s23 @@ -1368,8 +1369,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_mul_hi_u32 s35, s6, s8 ; GFX9-NEXT: s_add_u32 s22, s24, s22 ; GFX9-NEXT: s_addc_u32 s23, s35, s23 -; GFX9-NEXT: s_mul_i32 s24, s0, s13 -; GFX9-NEXT: s_mul_hi_u32 s35, s0, s13 +; GFX9-NEXT: s_mul_i32 s24, s16, s13 +; GFX9-NEXT: s_mul_hi_u32 s35, s16, s13 ; GFX9-NEXT: s_add_u32 s24, s24, s25 ; GFX9-NEXT: s_addc_u32 s22, s35, s22 ; GFX9-NEXT: s_mul_i32 s35, s1, s12 @@ -1414,31 +1415,30 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_cmp_lg_u32 s29, 0 ; GFX9-NEXT: s_addc_u32 s26, s26, 0 ; GFX9-NEXT: s_cmp_lg_u32 s24, 0 -; GFX9-NEXT: s_mul_i32 s16, s0, s8 ; GFX9-NEXT: s_addc_u32 s22, s26, s22 -; GFX9-NEXT: s_mul_i32 s0, s0, s15 -; GFX9-NEXT: s_addc_u32 s0, s23, s0 +; GFX9-NEXT: s_mul_i32 s16, s16, s15 +; GFX9-NEXT: s_addc_u32 s15, s23, s16 ; GFX9-NEXT: s_mul_i32 s1, s1, s14 ; GFX9-NEXT: s_cmp_lg_u32 s39, 0 -; GFX9-NEXT: s_addc_u32 s0, s0, s1 +; GFX9-NEXT: s_addc_u32 s1, s15, s1 ; GFX9-NEXT: s_mul_i32 s2, s2, s13 ; GFX9-NEXT: s_cmp_lg_u32 s38, 0 -; GFX9-NEXT: s_addc_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s2 ; GFX9-NEXT: s_mul_i32 s3, s3, s12 ; GFX9-NEXT: s_cmp_lg_u32 s37, 0 -; GFX9-NEXT: s_addc_u32 s0, s0, s3 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: s_mul_i32 s4, s4, s11 ; GFX9-NEXT: s_cmp_lg_u32 s36, 0 -; GFX9-NEXT: s_addc_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s4 ; GFX9-NEXT: s_mul_i32 s5, s5, s10 ; GFX9-NEXT: s_cmp_lg_u32 s35, 0 -; GFX9-NEXT: s_addc_u32 s0, s0, s5 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: s_mul_i32 s6, s6, s9 ; GFX9-NEXT: s_cmp_lg_u32 s25, 0 -; GFX9-NEXT: s_addc_u32 s0, s0, s6 +; GFX9-NEXT: s_addc_u32 s1, s1, s6 ; GFX9-NEXT: s_mul_i32 s7, s7, s8 -; GFX9-NEXT: s_add_u32 s7, s7, s0 -; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: s_mul_i32 s0, s0, s8 +; GFX9-NEXT: s_add_u32 s7, s7, s1 ; GFX9-NEXT: s_mov_b32 s1, s17 ; GFX9-NEXT: s_mov_b32 s2, s18 ; GFX9-NEXT: s_mov_b32 s3, s19 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index ff294d8378005..4248f7b6a1583 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -26,131 +26,133 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v0 -; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v6, v1, v0 -; CHECK-NEXT: v_xor_b32_e32 v7, v2, v0 -; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v6 -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v7 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v6 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v7, vcc -; CHECK-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 -; CHECK-NEXT: v_trunc_f32_e32 v3, v2 -; CHECK-NEXT: v_mac_f32_e32 v1, 0xcf800000, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v3 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v1 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v8, v[2:3] -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v11, v1 -; CHECK-NEXT: v_mul_lo_u32 v13, v8, v2 -; CHECK-NEXT: v_mul_lo_u32 v14, v11, v2 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v2, v1, v0 +; CHECK-NEXT: v_xor_b32_e32 v1, v3, v0 +; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 +; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc +; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 +; CHECK-NEXT: v_trunc_f32_e32 v8, v6 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8 +; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v8 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, v7 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v8, v9, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 +; CHECK-NEXT: v_mul_lo_u32 v13, v9, v7 +; CHECK-NEXT: v_mul_lo_u32 v14, v12, v7 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v9, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v13, v3 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v14, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v14, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v2, vcc -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5 -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v1 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v8, v[2:3] -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v9 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v5, v3, v9 -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v1 -; CHECK-NEXT: v_mul_lo_u32 v10, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v1, v11, v1 -; CHECK-NEXT: v_xor_b32_e32 v4, v4, v9 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v12, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v8, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc -; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v10, v5, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v11, v4, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v10, v4, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v1, v3 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v10, 0 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CHECK-NEXT: v_mul_hi_u32 v7, v12, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v3 +; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, v7 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v10, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v10 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v10, vcc +; CHECK-NEXT: v_xor_b32_e32 v8, v3, v10 +; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 +; CHECK-NEXT: v_mul_lo_u32 v5, v9, v7 +; CHECK-NEXT: v_xor_b32_e32 v11, v4, v10 +; CHECK-NEXT: v_mul_hi_u32 v4, v9, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v11, v3 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[2:3] -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v5, v1 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, v[2:3] -; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v4, v2, vcc -; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v4, v2 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v7 -; CHECK-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7 -; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v3, v4, v5, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v10 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v8, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v8, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3 +; CHECK-NEXT: v_mul_hi_u32 v9, v11, v4 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v7, v11, v4 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_mul_hi_u32 v6, v8, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v3, v5 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v7, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v5 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v11, v4 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v7 +; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; CHECK-NEXT: v_addc_u32_e32 v6, vcc, 0, v5, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v8 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v9, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v10, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -378,264 +380,265 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v5, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v5, v4, v8 -; GISEL-NEXT: v_xor_b32_e32 v4, v9, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v4, vcc -; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v9 -; GISEL-NEXT: v_trunc_f32_e32 v11, v10 -; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v11 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[10:11] -; GISEL-NEXT: v_mul_hi_u32 v17, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v16, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v17, v9 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, v4, v8 +; GISEL-NEXT: v_xor_b32_e32 v4, v5, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v10 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v4, vcc +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_mov_b32_e32 v5, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v9, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v11 +; GISEL-NEXT: v_mul_hi_u32 v17, v14, v11 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] +; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v12, v9 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v10, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v16, 0 -; GISEL-NEXT: v_mov_b32_e32 v9, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v13, v15, v[9:10] +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v5 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] +; GISEL-NEXT: v_mov_b32_e32 v5, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[5:6] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v0, v9 -; GISEL-NEXT: v_mul_lo_u32 v0, v15, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v11 -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] +; GISEL-NEXT: v_xor_b32_e32 v5, v0, v9 +; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 +; GISEL-NEXT: v_xor_b32_e32 v15, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11 +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v15, v14, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v5, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v1 +; GISEL-NEXT: v_mul_lo_u32 v13, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v11 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v13, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v10 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v13, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v15, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v13, v[10:11] -; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v14, v10, vcc -; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], v14, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v11 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v14, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v13, v[11:12] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v15, v11, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v11 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v1, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v10 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v7, v1, v10 -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v10 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v12, v11, v12, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v1, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5 +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v6 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v15, vcc -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v14 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v14, vcc +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v15 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v10 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v18, v1 ; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v18 ; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v7 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v19, 0 ; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v18 ; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v6, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v12, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v14, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v18, v[1:2] +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v15, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v18, v[1:2] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v21, v19, v[4:5] -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v21, v19, v[10:11] +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v18, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v19, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v19, v10 ; GISEL-NEXT: v_mul_hi_u32 v16, v19, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc ; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v18, v4 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v19, v4 +; GISEL-NEXT: v_mul_lo_u32 v16, v18, v10 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v4, v18, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v19, v0 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v18, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v13, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v13, v15, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v21, v12, v[4:5] -; GISEL-NEXT: v_xor_b32_e32 v1, v11, v8 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v5, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v2, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v4 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v19, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v18, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v13, v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v20, v11, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v14, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v21, v10, v[8:9] +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 +; GISEL-NEXT: v_xor_b32_e32 v1, v4, v13 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v9, v2, v14 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v4, v10, v8 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v14 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v16, v4 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v4 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v8 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v16, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v4, v5, v2 -; GISEL-NEXT: v_mul_hi_u32 v12, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v11, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v4, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v10, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v4, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v4, v9, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v11, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v8, v12, v13 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v13 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v10, v[3:4] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v8, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v12 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v9, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v14, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -659,128 +662,128 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v5, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v4, v1, v0 -; CGP-NEXT: v_xor_b32_e32 v5, v2, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v5 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc -; CGP-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 -; CGP-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CGP-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; CGP-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 -; CGP-NEXT: v_trunc_f32_e32 v3, v2 -; CGP-NEXT: v_mac_f32_e32 v1, 0xcf800000, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v1 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v3 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v15, v[2:3] -; CGP-NEXT: v_mul_hi_u32 v16, v12, v1 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v12, v[2:3] -; CGP-NEXT: v_mul_lo_u32 v3, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v1, v15, v1 -; CGP-NEXT: v_mul_lo_u32 v17, v12, v2 -; CGP-NEXT: v_mul_lo_u32 v18, v15, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v17 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v5, v0, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v1, v0 +; CGP-NEXT: v_xor_b32_e32 v1, v3, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 +; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc +; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 +; CGP-NEXT: v_trunc_f32_e32 v5, v4 +; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v5 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] +; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 +; CGP-NEXT: v_mul_lo_u32 v17, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v18, v15, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v2 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v17, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v18, v1 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 +; CGP-NEXT: v_mul_hi_u32 v16, v12, v4 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v17, v5 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v18, v3 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v16 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_mul_hi_u32 v2, v15, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v1 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v2, vcc -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v15, v[2:3] +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] ; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v1 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v12, v[2:3] -; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v13 +; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v13 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v3, v13 -; CGP-NEXT: v_mul_lo_u32 v3, v15, v1 -; CGP-NEXT: v_mul_lo_u32 v14, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v1, v15, v1 +; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 +; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 +; CGP-NEXT: v_mul_lo_u32 v14, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 ; CGP-NEXT: v_xor_b32_e32 v10, v10, v13 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v15, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; CGP-NEXT: v_mul_hi_u32 v14, v12, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v16, v1 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v16, v15, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; CGP-NEXT: v_mul_hi_u32 v14, v12, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_mul_hi_u32 v2, v15, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v15, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v10, v1 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v2 -; CGP-NEXT: v_mul_hi_u32 v14, v11, v1 -; CGP-NEXT: v_mul_hi_u32 v1, v10, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v10, v2 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CGP-NEXT: v_mul_hi_u32 v12, v11, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v14, v1 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v12, v11, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v11, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v3 +; CGP-NEXT: v_mul_hi_u32 v15, v10, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v14, v10, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v12, v11, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v1, v3 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v14, 0 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v3 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v12, v[2:3] -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, v1 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v14, v[2:3] -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v10, v2, vcc -; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v10, v2 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v5 -; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v5, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v3, v5 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v14, 0 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[4:5] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v5 -; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; CGP-NEXT: v_cndmask_b32_e64 v3, v10, v11, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 +; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v11, s[4:5] ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v14 ; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v2, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v10 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, 0, v11, vcc +; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v4, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc ; CGP-NEXT: v_xor_b32_e32 v3, v13, v0 ; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc @@ -832,128 +835,128 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: .LBB2_7: ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v2 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v7, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v6, v3, v2 -; CGP-NEXT: v_xor_b32_e32 v7, v4, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v7 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v6 -; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v7, vcc -; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CGP-NEXT: v_trunc_f32_e32 v5, v4 -; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v13, v[4:5] -; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v4 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v4, v3, v2 +; CGP-NEXT: v_xor_b32_e32 v3, v5, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 +; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc +; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; CGP-NEXT: v_trunc_f32_e32 v7, v6 +; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v10, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] +; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] +; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v15, v10, v6 +; CGP-NEXT: v_mul_lo_u32 v16, v13, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v4 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v15, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v6 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v3 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v13, v[4:5] +; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v6, vcc +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] ; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[4:5] -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v11 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v11 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v5, v11 -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_xor_b32_e32 v9, v7, v11 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v12, v10, v6 +; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 ; CGP-NEXT: v_xor_b32_e32 v8, v8, v11 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v13, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v14, v13, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v13, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 -; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_mul_hi_u32 v10, v9, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v10, v9, v6 +; CGP-NEXT: v_mul_hi_u32 v12, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v8, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CGP-NEXT: v_mul_hi_u32 v10, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v3, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v5 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v10, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v9, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v12, v[4:5] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v8, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v8, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v7 -; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v7, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v7 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v8, v6, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v6 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v7 -; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CGP-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 +; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc +; CGP-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 ; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v7 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v6, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc ; CGP-NEXT: v_xor_b32_e32 v5, v11, v2 ; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc @@ -1128,173 +1131,173 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v8, 0 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 ; GISEL-NEXT: s_sub_u32 s6, 0, 0x1000 ; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v8 +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v6, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v6 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v9, v[5:6] -; GISEL-NEXT: v_mul_hi_u32 v10, v7, v4 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v7, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v6, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v5 -; GISEL-NEXT: v_mul_lo_u32 v12, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; GISEL-NEXT: v_trunc_f32_e32 v7, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v4 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v9, v[7:8] +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v7 +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v4 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, v[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v9, v4 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v11, 0 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc +; GISEL-NEXT: v_mov_b32_e32 v4, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v5, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v10, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v11, v[8:9] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v9, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, v10, v6 +; GISEL-NEXT: v_xor_b32_e32 v9, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v11, v8 ; GISEL-NEXT: v_xor_b32_e32 v12, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v1, v11, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v11, v8 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v9, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v11, v1 -; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 ; GISEL-NEXT: v_mov_b32_e32 v5, 0x1000 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v12, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v10, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7] +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v7 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v5, v11, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[7:8] ; GISEL-NEXT: s_sub_u32 s6, 0, 0x1000 ; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v12, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v6 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v12, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], v12, v7 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v6, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, 0x1000 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v7, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, 0x1000 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, -1, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 -; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v1, vcc -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v6 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v10, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc +; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v7 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v11, vcc ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v1 ; GISEL-NEXT: v_trunc_f32_e32 v6, v6 ; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v8, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v14, -1, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v14, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v7, vcc ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v15, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[6:7] -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v8, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v12 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[6:7] +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v13, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v14, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v6 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v15, v6 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v0 ; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v15, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v9, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v10, v7, vcc ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v9, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v8, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v9 +; GISEL-NEXT: v_xor_b32_e32 v1, v8, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v9, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v8 ; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v6 -; GISEL-NEXT: v_xor_b32_e32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v9, v6 +; GISEL-NEXT: v_xor_b32_e32 v12, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v3, v9, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc @@ -1302,7 +1305,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v13, v6 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v8, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v6 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 @@ -1313,13 +1316,13 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v11, v2 ; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v10, v4 +; GISEL-NEXT: v_xor_b32_e32 v9, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 @@ -1341,7 +1344,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v9, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc @@ -1367,10 +1370,10 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v8 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v8 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_pow2k_denom: @@ -1378,8 +1381,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_movk_i32 s6, 0xf000 -; CGP-NEXT: s_movk_i32 s7, 0x1000 +; CGP-NEXT: s_movk_i32 s7, 0xf000 +; CGP-NEXT: s_movk_i32 s6, 0x1000 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -1390,7 +1393,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v6 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6] ; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 @@ -1414,11 +1417,11 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v9, 0 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v8, v5, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; CGP-NEXT: v_mov_b32_e32 v4, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v10, v[4:5] ; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v5 ; CGP-NEXT: v_addc_u32_e32 v11, vcc, v1, v5, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8] @@ -1465,11 +1468,11 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v1, v6 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v6 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v10, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v10, v[1:2] ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 ; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v11, v6, vcc @@ -1495,10 +1498,10 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v13, 0 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v14, -1, v14, vcc -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v15, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v15, v[1:2] ; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v11 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7] ; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v12, vcc @@ -1528,11 +1531,11 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v1, vcc, v6, v1 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v0 ; CGP-NEXT: v_addc_u32_e32 v13, vcc, v15, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v11, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CGP-NEXT: v_cndmask_b32_e32 v6, v9, v7, vcc ; CGP-NEXT: v_xor_b32_e32 v9, v6, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[1:2] ; CGP-NEXT: v_cndmask_b32_e32 v8, v10, v12, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7] @@ -1584,11 +1587,11 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 ; CGP-NEXT: v_mul_hi_u32 v10, v12, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v10, v[3:4] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[3:4] ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 ; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc ; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 @@ -1758,173 +1761,173 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v8, 0 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 ; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb ; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v8 +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v6, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v6 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v9, v[5:6] -; GISEL-NEXT: v_mul_hi_u32 v10, v7, v4 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v7, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v6, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v5 -; GISEL-NEXT: v_mul_lo_u32 v12, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; GISEL-NEXT: v_trunc_f32_e32 v7, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v4 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v9, v[7:8] +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v7 +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v4 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, v[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v9, v4 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v11, 0 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc +; GISEL-NEXT: v_mov_b32_e32 v4, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v5, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v10, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v11, v[8:9] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v9, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, v10, v6 +; GISEL-NEXT: v_xor_b32_e32 v9, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v11, v8 ; GISEL-NEXT: v_xor_b32_e32 v12, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v1, v11, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v11, v8 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v9, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v11, v1 -; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 ; GISEL-NEXT: v_mov_b32_e32 v5, 0x12d8fb -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v12, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v10, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7] +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v7 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v5, v11, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[7:8] ; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb ; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v12, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v6 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v12, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], v12, v7 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v6, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, 0x12d8fb +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v7, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, 0x12d8fb ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, -1, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 -; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v1, vcc -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v6 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v10, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc +; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v7 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v11, vcc ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v1 ; GISEL-NEXT: v_trunc_f32_e32 v6, v6 ; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v8, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v14, -1, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v14, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v7, vcc ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v15, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[6:7] -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v8, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v12 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[6:7] +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v13, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v14, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v6 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v15, v6 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v0 ; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v15, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v9, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v10, v7, vcc ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v9, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v8, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v9 +; GISEL-NEXT: v_xor_b32_e32 v1, v8, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v9, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v8 ; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v6 -; GISEL-NEXT: v_xor_b32_e32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v9, v6 +; GISEL-NEXT: v_xor_b32_e32 v12, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v3, v9, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc @@ -1932,7 +1935,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v13, v6 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v8, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v6 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 @@ -1943,13 +1946,13 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v11, v2 ; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v10, v4 +; GISEL-NEXT: v_xor_b32_e32 v9, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 @@ -1971,7 +1974,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v9, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc @@ -1997,10 +2000,10 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v8 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v8 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: @@ -2008,8 +2011,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_mov_b32 s6, 0xffed2705 -; CGP-NEXT: s_mov_b32 s7, 0x12d8fb +; CGP-NEXT: s_mov_b32 s7, 0xffed2705 +; CGP-NEXT: s_mov_b32 s6, 0x12d8fb ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -2020,7 +2023,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v6 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6] ; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 @@ -2044,11 +2047,11 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v9, 0 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v8, v5, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; CGP-NEXT: v_mov_b32_e32 v4, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v10, v[4:5] ; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v5 ; CGP-NEXT: v_addc_u32_e32 v11, vcc, v1, v5, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8] @@ -2095,11 +2098,11 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v1, v6 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v6 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v10, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v10, v[1:2] ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v11, v6, vcc @@ -2125,10 +2128,10 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v13, 0 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v14, -1, v14, vcc -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v15, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v15, v[1:2] ; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v11 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7] ; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v12, vcc @@ -2158,11 +2161,11 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v1, vcc, v6, v1 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v0 ; CGP-NEXT: v_addc_u32_e32 v13, vcc, v15, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v11, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CGP-NEXT: v_cndmask_b32_e32 v6, v9, v7, vcc ; CGP-NEXT: v_xor_b32_e32 v9, v6, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[1:2] ; CGP-NEXT: v_cndmask_b32_e32 v8, v10, v12, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7] @@ -2214,11 +2217,11 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 ; CGP-NEXT: v_mul_hi_u32 v10, v12, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v10, v[3:4] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[3:4] ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 ; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc ; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 @@ -2276,131 +2279,131 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: .LBB7_3: ; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v6, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v1, v0 -; CHECK-NEXT: v_xor_b32_e32 v8, v2, v0 -; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v7 -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v8 -; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v7 -; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc -; CHECK-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 -; CHECK-NEXT: v_trunc_f32_e32 v5, v2 -; CHECK-NEXT: v_mac_f32_e32 v1, 0xcf800000, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v5 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v9, 0 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[2:3] -; CHECK-NEXT: v_mul_lo_u32 v2, v12, v1 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6] -; CHECK-NEXT: v_mul_hi_u32 v6, v9, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v12, v1 -; CHECK-NEXT: v_mul_lo_u32 v13, v9, v5 -; CHECK-NEXT: v_mul_lo_u32 v14, v12, v5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v6, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v2, v1, v0 +; CHECK-NEXT: v_xor_b32_e32 v1, v5, v0 +; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 +; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; CHECK-NEXT: v_trunc_f32_e32 v7, v6 +; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v12, v8, v5 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] +; CHECK-NEXT: v_mul_lo_u32 v7, v11, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v13, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v14, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v9, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v14, v1 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; CHECK-NEXT: v_mul_hi_u32 v12, v8, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v14, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; CHECK-NEXT: v_mul_hi_u32 v5, v12, v5 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v1 -; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v2, vcc -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v9, 0 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[2:3] -; CHECK-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v10 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6] -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v10, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v2, v10 -; CHECK-NEXT: v_mul_lo_u32 v2, v12, v1 -; CHECK-NEXT: v_mul_lo_u32 v6, v9, v5 -; CHECK-NEXT: v_xor_b32_e32 v11, v3, v10 -; CHECK-NEXT: v_mul_hi_u32 v3, v9, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v12, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_mul_hi_u32 v6, v9, v5 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v5 +; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v7, v3, v9 +; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v10, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v12, v4, v9 +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v6 +; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CHECK-NEXT: v_mul_hi_u32 v10, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v12, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, v7, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v12, v3 +; CHECK-NEXT: v_mul_hi_u32 v10, v12, v4 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_mul_hi_u32 v5, v12, v5 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v1 -; CHECK-NEXT: v_mul_lo_u32 v5, v4, v2 -; CHECK-NEXT: v_mul_hi_u32 v6, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v11, v1 -; CHECK-NEXT: v_mul_hi_u32 v9, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v8, v12, v4 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_mul_hi_u32 v6, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v5, v4, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v6, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v5 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v8, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v1, v3 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v6, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v3 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v5, v[2:3] -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v4, v1 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v6, v[2:3] -; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v11, v2, vcc -; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v11, v2 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v8 -; CHECK-NEXT: v_subb_u32_e32 v2, vcc, v2, v8, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v7 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v8 -; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v3, v4, v9, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v6 -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v5 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v8 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v8 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v7 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v10, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -2445,89 +2448,90 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_lshl_b64 v[7:8], s[4:5], v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v4 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v7, v5, v4 -; GISEL-NEXT: v_xor_b32_e32 v5, v8, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v7 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v8, v5, v4 +; GISEL-NEXT: v_xor_b32_e32 v5, v7, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 0, v7 -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, 0, v5, vcc -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 -; GISEL-NEXT: v_trunc_f32_e32 v10, v9 -; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v10 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v10 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v12, v11, 0 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v12, v14, v[9:10] -; GISEL-NEXT: v_mul_hi_u32 v16, v11, v8 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v13, v11, v[9:10] -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v15, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v16, v8 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v8 +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc +; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7 +; GISEL-NEXT: v_trunc_f32_e32 v11, v9 +; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v11 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v13, v12, 0 +; GISEL-NEXT: v_mov_b32_e32 v7, v10 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v13, v15, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v7, v15, v9 +; GISEL-NEXT: v_mul_hi_u32 v16, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v14, v12, v[10:11] +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v16, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v11, v8 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v14, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v12, v15, 0 -; GISEL-NEXT: v_mov_b32_e32 v8, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v12, v14, v[8:9] -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v13, v15, v[10:11] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v0, v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v10 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v1, v15, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v7 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v13, v12, 0 +; GISEL-NEXT: v_mov_b32_e32 v7, v10 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v13, v15, v[7:8] +; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v14, v12, v[10:11] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v13, v0, v7 +; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 +; GISEL-NEXT: v_xor_b32_e32 v14, v1, v7 +; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v14, v10 +; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v14, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0 ; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1 ; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v16, v1 +; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 @@ -2535,165 +2539,165 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v7, v14, 0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v8, v15, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v7, v15, v[1:2] +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v8, v16, v[1:2] ; GISEL-NEXT: v_lshl_b64 v[11:12], s[4:5], v6 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v14, v[9:10] -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v16, v9, vcc -; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v16, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v15, v[9:10] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v14, v9, vcc +; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v14, v9 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v6, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v13, v9, v10, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v1, vcc ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v12 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v6 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v12, v6, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v1, v6 -; GISEL-NEXT: v_xor_b32_e32 v12, v10, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v11 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v12 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v14 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v15, vcc -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v13, v9, v10, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v12, v6, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6 +; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v9 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v15 +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v11 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v14, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v8 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v18, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v18 +; GISEL-NEXT: v_trunc_f32_e32 v12, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v12 ; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v10 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v19, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v18 -; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v10, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v20, v18, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v21, v19, v[9:10] -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v17, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v12 +; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v9, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v14, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v22, v[1:2] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v17 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v21, v19, v[11:12] +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v18, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v18, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v19, v9 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v16 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v22, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v19, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v19, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v22, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v18, v9 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v19, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v16, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v18, v9 +; GISEL-NEXT: v_mul_lo_u32 v14, v22, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v19, v0 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v18, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v19, v0 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v22, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v11, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v14, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v10, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v13, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v15, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v12, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v13, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v21, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v21, v11, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc ; GISEL-NEXT: v_xor_b32_e32 v5, v2, v13 -; GISEL-NEXT: v_mul_lo_u32 v2, v10, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 ; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 -; GISEL-NEXT: v_mul_hi_u32 v3, v9, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v10, v4 +; GISEL-NEXT: v_mul_lo_u32 v3, v12, v4 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v14, v2 -; GISEL-NEXT: v_mul_hi_u32 v14, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v14 -; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 ; GISEL-NEXT: v_mul_lo_u32 v4, v5, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v2 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v15, v2 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GISEL-NEXT: v_mul_hi_u32 v4, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v8 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v9, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v12, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc ; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v12 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v11 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v10 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v12 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v9 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v12 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v11 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v12 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 ; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc @@ -2701,9 +2705,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; GISEL-NEXT: v_xor_b32_e32 v4, v13, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -2729,131 +2733,131 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v4, v1, v0 -; CGP-NEXT: v_xor_b32_e32 v10, v2, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v10, vcc -; CGP-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 -; CGP-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CGP-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; CGP-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 -; CGP-NEXT: v_trunc_f32_e32 v3, v2 -; CGP-NEXT: v_mac_f32_e32 v1, 0xcf800000, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v1 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v14, v[2:3] -; CGP-NEXT: v_mul_hi_u32 v15, v11, v1 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v11, v[2:3] -; CGP-NEXT: v_mul_lo_u32 v3, v14, v1 -; CGP-NEXT: v_mul_hi_u32 v1, v14, v1 -; CGP-NEXT: v_mul_lo_u32 v16, v11, v2 -; CGP-NEXT: v_mul_lo_u32 v17, v14, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v1, v0 +; CGP-NEXT: v_xor_b32_e32 v1, v3, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 +; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc +; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 +; CGP-NEXT: v_trunc_f32_e32 v10, v4 +; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v10 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v15, v3 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] +; CGP-NEXT: v_mul_hi_u32 v11, v12, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 +; CGP-NEXT: v_mul_lo_u32 v16, v12, v10 +; CGP-NEXT: v_mul_lo_u32 v17, v15, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v11, v2 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v12, v10 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v17, v3 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v15, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v1 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v2, vcc -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v14, v[2:3] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v15, v11, v1 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v11, v[2:3] -; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v12 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v3, v12 -; CGP-NEXT: v_mul_lo_u32 v3, v14, v1 -; CGP-NEXT: v_mul_lo_u32 v13, v11, v2 -; CGP-NEXT: v_mul_hi_u32 v1, v14, v1 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v12 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v15 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v14, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v13, v3 -; CGP-NEXT: v_mul_hi_u32 v13, v11, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v13, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v1 -; CGP-NEXT: v_mul_lo_u32 v11, v9, v2 -; CGP-NEXT: v_mul_hi_u32 v13, v9, v1 -; CGP-NEXT: v_mul_hi_u32 v1, v8, v1 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v2 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v13 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v8, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v11, v9, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v1, v3 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v13, 0 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v3 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v11, v[2:3] -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v9, v1 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, v[2:3] -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v2, vcc -; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v8, v2 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v10 -; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v10, vcc -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v16, v11 +; CGP-NEXT: v_mul_hi_u32 v10, v15, v10 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v13 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v4, v13 +; CGP-NEXT: v_mul_lo_u32 v4, v15, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v12, v10 +; CGP-NEXT: v_xor_b32_e32 v14, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v8, v15, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v10 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v15, v10 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v8, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v11, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v11, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v14, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v14, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_mul_hi_u32 v9, v11, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v3, v8 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v10, 0 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v12, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v10, v[8:9] +; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v14, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CGP-NEXT: v_subb_u32_e32 v8, vcc, v8, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v10 -; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; CGP-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5] -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v13 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CGP-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc +; CGP-NEXT: v_cndmask_b32_e64 v4, v9, v11, s[4:5] +; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v10 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v10 -; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v8 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v8, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v2, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v9 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v12, v0 -; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v13, v0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v2, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -2904,128 +2908,130 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: .LBB8_7: ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v10 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v2 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v10, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v6, v3, v2 -; CGP-NEXT: v_xor_b32_e32 v10, v4, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v10 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v6 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v10, vcc -; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CGP-NEXT: v_trunc_f32_e32 v8, v4 -; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v8 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v14, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v14, v3 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[8:9] -; CGP-NEXT: v_mul_hi_u32 v9, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v8 -; CGP-NEXT: v_mul_lo_u32 v16, v14, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v10, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v4, v3, v2 +; CGP-NEXT: v_xor_b32_e32 v3, v6, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 +; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 +; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 +; CGP-NEXT: v_trunc_f32_e32 v10, v8 +; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v11, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mov_b32_e32 v6, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] +; CGP-NEXT: v_mul_lo_u32 v6, v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 +; CGP-NEXT: v_mul_lo_u32 v15, v11, v9 +; CGP-NEXT: v_mul_lo_u32 v16, v14, v9 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v11, v8 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v15, v9 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v3 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v14, v[4:5] +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 +; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v6 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v8, vcc +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mov_b32_e32 v6, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] ; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[8:9] -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v4, v12 -; CGP-NEXT: v_mul_lo_u32 v4, v14, v3 -; CGP-NEXT: v_mul_lo_u32 v9, v11, v8 -; CGP-NEXT: v_xor_b32_e32 v13, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v10, v5, v12 ; CGP-NEXT: v_mul_lo_u32 v5, v14, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v11, v8 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_mul_lo_u32 v7, v11, v9 +; CGP-NEXT: v_xor_b32_e32 v13, v6, v12 +; CGP-NEXT: v_mul_hi_u32 v6, v11, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v8, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v14, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v7, v11, v9 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v14, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v10, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v13, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v10, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v9, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v5 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, v[4:5] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v10 -; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v10, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v5, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v7 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v8, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v13, v6, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v13, v6 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v10 -; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CGP-NEXT: v_cndmask_b32_e64 v5, v7, v11, s[4:5] -; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v9 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 +; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc +; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[4:5] +; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v9 ; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v10 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v7 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v10 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v6, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_xor_b32_e32 v5, v12, v2 ; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc @@ -3164,7 +3170,6 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v4 ; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc ; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_and_b32_e32 v13, 0xffffff, v2 ; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6] ; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, 0, v0 @@ -3196,6 +3201,7 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mul_lo_u32 v7, v10, v4 ; GISEL-NEXT: v_mul_hi_u32 v8, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 +; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 @@ -3213,163 +3219,162 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v8, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v3, v12, v[0:1] -; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v3, v0, v[5:6] +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v6 ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v9, v[7:8] -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v11, v5, vcc +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v10, v4 +; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v11, v5, vcc ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], 0, v0 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v10 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v5, v1, vcc -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v11 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v4, vcc -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v4, v4 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v7, v6, v7, s[4:5] -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, 0 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v10, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v15, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v9 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v17, v14, v[5:6] -; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 0, v2 +; GISEL-NEXT: v_addc_u32_e64 v2, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v2 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v13 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v6, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v6 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v2, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[6:7] +; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v13, v[6:7] +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v0, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v4 -; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v14, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, v13, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v4 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v15, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GISEL-NEXT: v_mul_hi_u32 v2, v14, v5 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v15, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_mul_hi_u32 v1, v13, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v15, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v3, 0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v18 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v4, v[1:2] -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v19, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v3, v[1:2] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v6, v19, v6, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v2, v4, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v3, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v3, v0 -; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, v14, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v13, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v6 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v14, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v17 +; GISEL-NEXT: v_mov_b32_e32 v1, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v11, v[1:2] +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v18, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v10, v[6:7] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v18, v14, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v10, v6 +; GISEL-NEXT: v_mul_hi_u32 v13, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v12 +; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v13, v4, v1 -; GISEL-NEXT: v_mul_hi_u32 v0, v4, v0 -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v3, v1 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v10, v6 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v7 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 +; GISEL-NEXT: v_mul_hi_u32 v6, v11, v6 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v13, v7 -; GISEL-NEXT: v_mul_hi_u32 v1, v4, v1 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v7, v2 -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v2 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v3, v0 -; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v4, v1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v1 -; GISEL-NEXT: v_mul_hi_u32 v4, v9, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v9, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v0, v2 -; GISEL-NEXT: v_mul_hi_u32 v1, v11, v1 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, 0 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 +; GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], v11, v6, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, v[0:1] -; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v7, v[3:4] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v6, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v11, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v11, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v8 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v10 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v7 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v12, v6 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[0:1] +; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc +; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v9, v5 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v2 +; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v4 +; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v2 +; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v7, v9, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v8 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v6, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v7 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc ; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 6c7e4fe3f01c7..4c444f46ff3dd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -1281,22 +1281,22 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s4, s13, 31 -; GFX8-NEXT: s_ashr_i32 s16, s1, 31 -; GFX8-NEXT: s_add_u32 s12, s12, s4 -; GFX8-NEXT: s_addc_u32 s13, s13, s4 -; GFX8-NEXT: s_add_u32 s0, s0, s16 -; GFX8-NEXT: s_mov_b32 s17, s16 -; GFX8-NEXT: s_addc_u32 s1, s1, s16 -; GFX8-NEXT: s_xor_b64 s[6:7], s[0:1], s[16:17] -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX8-NEXT: s_ashr_i32 s6, s1, 31 +; GFX8-NEXT: s_add_u32 s16, s12, s4 +; GFX8-NEXT: s_addc_u32 s17, s13, s4 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_mov_b32 s7, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s6 +; GFX8-NEXT: s_xor_b64 s[12:13], s[0:1], s[6:7] +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12 ; GFX8-NEXT: s_mov_b32 s5, s4 -; GFX8-NEXT: s_xor_b64 s[12:13], s[12:13], s[4:5] +; GFX8-NEXT: s_xor_b64 s[16:17], s[16:17], s[4:5] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_u32 s18, 0, s6 -; GFX8-NEXT: s_subb_u32 s19, 0, s7 +; GFX8-NEXT: s_sub_u32 s18, 0, s12 +; GFX8-NEXT: s_subb_u32 s19, 0, s13 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 @@ -1337,9 +1337,9 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: s_xor_b64 s[18:19], s[4:5], s[16:17] -; GFX8-NEXT: s_ashr_i32 s16, s3, 31 -; GFX8-NEXT: s_mov_b32 s17, s16 +; GFX8-NEXT: s_xor_b64 s[18:19], s[4:5], s[6:7] +; GFX8-NEXT: s_ashr_i32 s6, s15, 31 +; GFX8-NEXT: s_mov_b32 s7, s6 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 @@ -1359,64 +1359,65 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc -; GFX8-NEXT: v_mul_lo_u32 v2, s13, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, s12, v1 -; GFX8-NEXT: v_mul_hi_u32 v4, s12, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, s13, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s13, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, s17, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, s16, v1 +; GFX8-NEXT: v_mul_hi_u32 v4, s16, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, s17, v0 +; GFX8-NEXT: v_mul_hi_u32 v5, s17, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v4, s13, v1 +; GFX8-NEXT: v_mul_lo_u32 v4, s17, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_mul_hi_u32 v3, s12, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v4, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s13 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s12, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_ashr_i32 s12, s15, 31 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v3, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v6, s17 +; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s16, v0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v4, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v5, s13 +; GFX8-NEXT: s_ashr_i32 s16, s3, 31 ; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s13, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v6 +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s17, v1 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v7 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v6 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s6, v7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 +; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s12, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] ; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc ; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4 ; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v1 ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] -; GFX8-NEXT: s_add_u32 s0, s14, s12 -; GFX8-NEXT: s_addc_u32 s1, s15, s12 +; GFX8-NEXT: s_add_u32 s0, s14, s6 +; GFX8-NEXT: s_addc_u32 s1, s15, s6 ; GFX8-NEXT: s_add_u32 s2, s2, s16 +; GFX8-NEXT: s_mov_b32 s17, s16 ; GFX8-NEXT: s_addc_u32 s3, s3, s16 ; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] ; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc ; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s6, v8 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s12, v8 ; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v5 @@ -1430,8 +1431,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v12 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v13, v0 -; GFX8-NEXT: s_mov_b32 s13, s12 -; GFX8-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] +; GFX8-NEXT: s_xor_b64 s[12:13], s[0:1], s[6:7] ; GFX8-NEXT: s_sub_u32 s5, 0, s2 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v13, 0 @@ -1504,37 +1504,37 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NEXT: v_mul_lo_u32 v7, s7, v2 -; GFX8-NEXT: v_mul_lo_u32 v8, s6, v3 +; GFX8-NEXT: v_mul_lo_u32 v7, s13, v2 +; GFX8-NEXT: v_mul_lo_u32 v8, s12, v3 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc -; GFX8-NEXT: v_mul_hi_u32 v6, s6, v2 +; GFX8-NEXT: v_mul_hi_u32 v6, s12, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s7, v3 -; GFX8-NEXT: v_mul_hi_u32 v2, s7, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, s13, v3 +; GFX8-NEXT: v_mul_hi_u32 v2, s13, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, s6, v3 +; GFX8-NEXT: v_mul_hi_u32 v8, s12, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, s7, v3 +; GFX8-NEXT: v_mul_hi_u32 v9, s13, v3 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 ; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v10, s7 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, s13 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s12, v2 ; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v10, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s7, v6 +; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s13, v6 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 @@ -1567,16 +1567,16 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v2, v6, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX8-NEXT: s_xor_b64 s[0:1], s[12:13], s[16:17] +; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[16:17] ; GFX8-NEXT: v_xor_b32_e32 v2, s0, v8 ; GFX8-NEXT: v_xor_b32_e32 v3, s1, v9 ; GFX8-NEXT: v_mov_b32_e32 v8, s1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc -; GFX8-NEXT: v_xor_b32_e32 v6, s12, v6 -; GFX8-NEXT: v_xor_b32_e32 v7, s12, v7 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s12, v6 +; GFX8-NEXT: v_xor_b32_e32 v6, s6, v6 +; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 +; GFX8-NEXT: v_mov_b32_e32 v8, s6 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v6 ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, s8 ; GFX8-NEXT: v_mov_b32_e32 v9, s9 @@ -1593,22 +1593,22 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s13, 31 -; GFX9-NEXT: s_ashr_i32 s16, s1, 31 -; GFX9-NEXT: s_add_u32 s12, s12, s4 -; GFX9-NEXT: s_addc_u32 s13, s13, s4 -; GFX9-NEXT: s_add_u32 s0, s0, s16 -; GFX9-NEXT: s_mov_b32 s17, s16 -; GFX9-NEXT: s_addc_u32 s1, s1, s16 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[16:17] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_add_u32 s16, s12, s4 +; GFX9-NEXT: s_addc_u32 s17, s13, s4 +; GFX9-NEXT: s_add_u32 s0, s0, s6 +; GFX9-NEXT: s_mov_b32 s7, s6 +; GFX9-NEXT: s_addc_u32 s1, s1, s6 +; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[6:7] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s13 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12 ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_xor_b64 s[12:13], s[12:13], s[4:5] +; GFX9-NEXT: s_xor_b64 s[16:17], s[16:17], s[4:5] ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_u32 s18, 0, s6 -; GFX9-NEXT: s_subb_u32 s19, 0, s7 +; GFX9-NEXT: s_sub_u32 s18, 0, s12 +; GFX9-NEXT: s_subb_u32 s19, 0, s13 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 @@ -1642,16 +1642,16 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v3, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: s_xor_b64 s[18:19], s[4:5], s[16:17] -; GFX9-NEXT: s_ashr_i32 s16, s3, 31 -; GFX9-NEXT: s_mov_b32 s17, s16 +; GFX9-NEXT: s_xor_b64 s[18:19], s[4:5], s[6:7] +; GFX9-NEXT: s_ashr_i32 s6, s15, 31 +; GFX9-NEXT: s_mov_b32 s7, s6 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 @@ -1670,64 +1670,65 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s13, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0 +; GFX9-NEXT: v_mul_hi_u32 v6, s17, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1 +; GFX9-NEXT: v_mul_lo_u32 v4, s17, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s16, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v5, 0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v5, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v4, v3, v0, v6 ; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v4, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s12, v1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s7, v5, v[2:3] -; GFX9-NEXT: s_ashr_i32 s12, s15, 31 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v4, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v6, s17 +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s16, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v5, v[2:3] +; GFX9-NEXT: s_ashr_i32 s16, s3, 31 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v1, s13, v2 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v6 +; GFX9-NEXT: v_sub_u32_e32 v1, s17, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s6, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 +; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s12, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[0:1] ; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 ; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v4, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v10 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v9 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v10 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s14, s12 -; GFX9-NEXT: s_addc_u32 s1, s15, s12 +; GFX9-NEXT: s_add_u32 s0, s14, s6 +; GFX9-NEXT: s_addc_u32 s1, s15, s6 ; GFX9-NEXT: s_add_u32 s2, s2, s16 +; GFX9-NEXT: s_mov_b32 s17, s16 ; GFX9-NEXT: s_addc_u32 s3, s3, s16 ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] ; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s3 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 -; GFX9-NEXT: v_subrev_co_u32_e32 v16, vcc, s6, v9 +; GFX9-NEXT: v_subrev_co_u32_e32 v16, vcc, s12, v9 ; GFX9-NEXT: v_subbrev_co_u32_e32 v17, vcc, 0, v1, vcc ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v15 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v7 @@ -1741,29 +1742,28 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v13 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v1 -; GFX9-NEXT: s_mov_b32 s13, s12 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] +; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v14, 0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v14, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_subb_u32 s14, 0, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v13, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v11, vcc -; GFX9-NEXT: v_mul_hi_u32 v11, v14, v1 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v14, v[2:3] ; GFX9-NEXT: v_mul_lo_u32 v3, v13, v1 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v12 +; GFX9-NEXT: v_mul_hi_u32 v11, v14, v1 ; GFX9-NEXT: v_mul_lo_u32 v4, v14, v2 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v16, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 ; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: v_mul_lo_u32 v11, v13, v2 +; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_mul_hi_u32 v4, v14, v2 ; GFX9-NEXT: v_mul_hi_u32 v2, v13, v2 @@ -1812,18 +1812,18 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add3_u32 v4, v8, v7, v4 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v11, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v12, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, s7, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, s6, v4 -; GFX9-NEXT: v_mul_hi_u32 v10, s6, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX9-NEXT: v_mul_hi_u32 v12, s7, v4 +; GFX9-NEXT: v_mul_lo_u32 v7, s13, v3 +; GFX9-NEXT: v_mul_lo_u32 v8, s12, v4 +; GFX9-NEXT: v_mul_hi_u32 v10, s12, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s13, v3 +; GFX9-NEXT: v_mul_hi_u32 v12, s13, v4 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, s7, v4 +; GFX9-NEXT: v_mul_lo_u32 v10, s13, v4 ; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_mul_hi_u32 v8, s6, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, s12, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, s4 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc @@ -1837,13 +1837,13 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v9, vcc ; GFX9-NEXT: v_add3_u32 v9, v8, v7, v12 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s2, v9, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v10, s7 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 +; GFX9-NEXT: v_mov_b32_e32 v10, s13 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s12, v3 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v11, v[7:8] ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v10, v7, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v8 -; GFX9-NEXT: v_sub_u32_e32 v7, s7, v7 +; GFX9-NEXT: v_sub_u32_e32 v7, s13, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] @@ -1875,16 +1875,16 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v3, v7, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v4, s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], s[12:13], s[16:17] +; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[16:17] ; GFX9-NEXT: v_xor_b32_e32 v3, s0, v10 ; GFX9-NEXT: v_xor_b32_e32 v4, s1, v9 ; GFX9-NEXT: v_mov_b32_e32 v9, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v9, vcc -; GFX9-NEXT: v_xor_b32_e32 v7, s12, v7 -; GFX9-NEXT: v_xor_b32_e32 v8, s12, v8 -; GFX9-NEXT: v_mov_b32_e32 v9, s12 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s12, v7 +; GFX9-NEXT: v_xor_b32_e32 v7, s6, v7 +; GFX9-NEXT: v_xor_b32_e32 v8, s6, v8 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v7 ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v9, vcc ; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[8:9] ; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 21fd7b594aca4..d0c55c69f5087 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -24,135 +24,135 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v0 -; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v1, v0 -; CHECK-NEXT: v_xor_b32_e32 v6, v2, v0 -; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v3 -; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v8, vcc, 0, v3 -; CHECK-NEXT: v_subb_u32_e32 v9, vcc, 0, v6, vcc -; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CHECK-NEXT: v_trunc_f32_e32 v2, v1 -; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v7, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v10, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v11, v7, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v7, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v10, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v10, v0 -; CHECK-NEXT: v_mul_lo_u32 v12, v7, v1 -; CHECK-NEXT: v_mul_lo_u32 v13, v10, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v2, v1 +; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 +; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 +; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1 +; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; CHECK-NEXT: v_trunc_f32_e32 v6, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v7, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 +; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_mul_hi_u32 v11, v7, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v13, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CHECK-NEXT: v_mul_hi_u32 v1, v10, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v0 -; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v7, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v10, v[1:2] -; CHECK-NEXT: v_ashrrev_i32_e32 v8, 31, v5 -; CHECK-NEXT: v_mul_hi_u32 v11, v7, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v7, v[1:2] -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v8 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v8, vcc -; CHECK-NEXT: v_xor_b32_e32 v5, v2, v8 -; CHECK-NEXT: v_mul_lo_u32 v2, v10, v0 -; CHECK-NEXT: v_mul_lo_u32 v9, v7, v1 -; CHECK-NEXT: v_mul_hi_u32 v0, v10, v0 -; CHECK-NEXT: v_xor_b32_e32 v4, v4, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v11, v10, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; CHECK-NEXT: v_mul_hi_u32 v9, v7, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CHECK-NEXT: v_mul_hi_u32 v1, v10, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 -; CHECK-NEXT: v_mul_hi_u32 v9, v5, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v10, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v5, v3, v9 +; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v10, v4, v9 +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v0, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v9, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v2, v[1:2] -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v9, v[1:2] -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v4, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v4, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v2, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v7, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v7, v[3:4] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc +; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v3 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v6 -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CHECK-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v8 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v8 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -476,159 +476,159 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v8 -; GISEL-NEXT: v_subb_u32_e32 v12, vcc, v0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v0, v8, vcc ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v9, v1, v9, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v0 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v0, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v1, v0 -; GISEL-NEXT: v_xor_b32_e32 v14, v6, v0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v13 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v14 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v10, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v16, s[4:5], 0, v12, vcc +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v0, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, v1, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v9, vcc ; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v16, v8 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v8 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v17, v6, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v15, v15, v1, s[4:5] ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v6, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v19, s[4:5], 0, v13 -; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, v14, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v21, v6 -; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v12, v8, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v18, v0 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v19, v21, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v20, v18, v[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v15, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v21, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v18, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v21, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; GISEL-NEXT: v_trunc_f32_e32 v16, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v16 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 0, v6 +; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, v7, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v17, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v9, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v18, v16, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v17, v[8:9] +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v20, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v13, v17, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v21, v5 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 -; GISEL-NEXT: v_mul_hi_u32 v7, v18, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; GISEL-NEXT: v_mul_hi_u32 v5, v21, v5 +; GISEL-NEXT: v_mul_lo_u32 v14, v16, v8 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v18, v0 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v21, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v7, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v10, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v19, v12, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v9, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v20, v7, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v0 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v16, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v13, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v18, v14, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v13, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v13, v8 +; GISEL-NEXT: v_xor_b32_e32 v12, v3, v10 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 -; GISEL-NEXT: v_xor_b32_e32 v11, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v10, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v8, 0 +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v13, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v5, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v8, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v11, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v11, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v14 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v14 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v2, v13 -; GISEL-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v14 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v14 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v6, v13 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v7 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64: @@ -646,131 +646,131 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB2_2 ; CGP-NEXT: ; %bb.1: -; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v5, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v1, v0 -; CGP-NEXT: v_xor_b32_e32 v4, v2, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v0, v3 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v3 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v4, vcc -; CGP-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 -; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; CGP-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CGP-NEXT: v_trunc_f32_e32 v2, v1 -; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v0 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v14, v[1:2] -; CGP-NEXT: v_mul_hi_u32 v15, v5, v0 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v5, v[1:2] -; CGP-NEXT: v_mul_lo_u32 v2, v14, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v14, v0 -; CGP-NEXT: v_mul_lo_u32 v16, v5, v1 -; CGP-NEXT: v_mul_lo_u32 v17, v14, v1 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v16 +; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v5 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v1 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v5, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v1 +; CGP-NEXT: v_xor_b32_e32 v1, v2, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 +; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc +; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; CGP-NEXT: v_trunc_f32_e32 v4, v3 +; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v5, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] +; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] +; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 +; CGP-NEXT: v_mul_lo_u32 v16, v5, v3 +; CGP-NEXT: v_mul_lo_u32 v17, v14, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v16, v2 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v17, v0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v5, v3 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v17, v2 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v15 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_mul_hi_u32 v1, v14, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v0 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v14, v[1:2] +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v2 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v3, vcc +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] ; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v0 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v5, v[1:2] -; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v12 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v2, v12 -; CGP-NEXT: v_mul_lo_u32 v2, v14, v0 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v0, v14, v0 +; CGP-NEXT: v_xor_b32_e32 v11, v4, v12 +; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 +; CGP-NEXT: v_mul_lo_u32 v13, v5, v3 +; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 ; CGP-NEXT: v_xor_b32_e32 v10, v10, v12 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v14, v1 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v15, v14, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_mul_hi_u32 v13, v5, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mul_hi_u32 v1, v14, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v14, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v5, v11, v1 -; CGP-NEXT: v_mul_hi_u32 v13, v11, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v1 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v10, v1 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v10, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v11, v3 +; CGP-NEXT: v_mul_hi_u32 v13, v11, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v10, v2 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v10, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v13, 0 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v2, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_subb_u32_e64 v2, s[4:5], v10, v1, vcc -; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v10, v1 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v2, v4 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v13, 0 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v13, v[3:4] +; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v4 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v10, vcc, v0, v3 -; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v1, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v4 +; CGP-NEXT: v_sub_i32_e32 v10, vcc, v2, v0 +; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v3 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v0 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v4 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v1 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 ; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v0, v12 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v12 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 @@ -815,131 +815,131 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] ; CGP-NEXT: .LBB2_7: -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v2 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v7, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v3, v2 -; CGP-NEXT: v_xor_b32_e32 v6, v4, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v5 -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v5 -; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v6, vcc -; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CGP-NEXT: v_trunc_f32_e32 v4, v3 -; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v12, v[3:4] -; CGP-NEXT: v_mul_hi_u32 v13, v7, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v7, v[3:4] -; CGP-NEXT: v_mul_lo_u32 v4, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v12, v2 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v3 -; CGP-NEXT: v_mul_lo_u32 v15, v12, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v7 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v3 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v7, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v3 +; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3 +; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; CGP-NEXT: v_trunc_f32_e32 v6, v5 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] +; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] +; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v14, v7, v5 +; CGP-NEXT: v_mul_lo_u32 v15, v12, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v7, v5 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v3, v12, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v2 -; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v3, vcc -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v12, v[3:4] +; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v4 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] ; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v7, v[3:4] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] +; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v10 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v4, v10 -; CGP-NEXT: v_mul_lo_u32 v4, v12, v2 -; CGP-NEXT: v_mul_lo_u32 v11, v7, v3 -; CGP-NEXT: v_mul_hi_u32 v2, v12, v2 +; CGP-NEXT: v_xor_b32_e32 v9, v6, v10 +; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v11, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 ; CGP-NEXT: v_xor_b32_e32 v8, v8, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v12, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v12, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_mul_hi_u32 v3, v12, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v12, v3, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v8, v2 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v3 -; CGP-NEXT: v_mul_hi_u32 v11, v9, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v8, v3 +; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v7, v9, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v11, v2 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v8, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_mul_hi_u32 v7, v9, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v2, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v11, 0 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v4, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v11, v[3:4] -; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v8, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v8, v3 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, v[5:6] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v2, v5 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v6 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; CGP-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 @@ -1354,8 +1354,8 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_movk_i32 s6, 0xf000 -; CGP-NEXT: s_movk_i32 s7, 0x1000 +; CGP-NEXT: s_movk_i32 s7, 0xf000 +; CGP-NEXT: s_movk_i32 s6, 0x1000 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -1366,7 +1366,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v6 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6] ; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 @@ -1390,11 +1390,11 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v9, 0 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v8, v5, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; CGP-NEXT: v_mov_b32_e32 v4, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v10, v[4:5] ; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v5 ; CGP-NEXT: v_addc_u32_e32 v11, vcc, v1, v5, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8] @@ -1442,11 +1442,11 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: v_mul_hi_u32 v9, v11, v0 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v1, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v1, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v6, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v6, v[1:2] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v0 ; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v11, v6, vcc ; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v11, v6 @@ -1469,10 +1469,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v14, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v13, 0 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 ; CGP-NEXT: v_cndmask_b32_e32 v15, -1, v7, vcc -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v14, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[1:2] ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, v4 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7] ; CGP-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v12, vcc @@ -1502,12 +1502,12 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v1, vcc, v6, v1 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v0 ; CGP-NEXT: v_addc_u32_e32 v13, vcc, v14, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v11, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc ; CGP-NEXT: v_cndmask_b32_e32 v8, v9, v12, vcc ; CGP-NEXT: v_xor_b32_e32 v9, v6, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[1:2] ; CGP-NEXT: v_xor_b32_e32 v1, v8, v5 ; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7] @@ -1558,11 +1558,11 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_mul_hi_u32 v9, v12, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v3, 0 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v3, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v5, v[3:4] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v5, v[3:4] ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 ; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc ; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 @@ -1975,8 +1975,8 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_mov_b32 s6, 0xffed2705 -; CGP-NEXT: s_mov_b32 s7, 0x12d8fb +; CGP-NEXT: s_mov_b32 s7, 0xffed2705 +; CGP-NEXT: s_mov_b32 s6, 0x12d8fb ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -1987,7 +1987,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v6 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6] ; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 @@ -2011,11 +2011,11 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v9, 0 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v8, v5, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; CGP-NEXT: v_mov_b32_e32 v4, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v10, v[4:5] ; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v5 ; CGP-NEXT: v_addc_u32_e32 v11, vcc, v1, v5, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8] @@ -2063,11 +2063,11 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; CGP-NEXT: v_mul_hi_u32 v9, v11, v0 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v1, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v1, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v6, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v6, v[1:2] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v0 ; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v11, v6, vcc ; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v11, v6 @@ -2090,10 +2090,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v14, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v13, 0 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 ; CGP-NEXT: v_cndmask_b32_e32 v15, -1, v7, vcc -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v14, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[1:2] ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, v4 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7] ; CGP-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v12, vcc @@ -2123,12 +2123,12 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v1, vcc, v6, v1 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v0 ; CGP-NEXT: v_addc_u32_e32 v13, vcc, v14, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v11, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc ; CGP-NEXT: v_cndmask_b32_e32 v8, v9, v12, vcc ; CGP-NEXT: v_xor_b32_e32 v9, v6, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[1:2] ; CGP-NEXT: v_xor_b32_e32 v1, v8, v5 ; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7] @@ -2179,11 +2179,11 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_mul_hi_u32 v9, v12, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v3, 0 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v3, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v5, v[3:4] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v5, v[3:4] ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 ; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc ; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 @@ -2237,135 +2237,137 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] ; CHECK-NEXT: .LBB7_3: -; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v6 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v6, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v5, v1, v0 -; CHECK-NEXT: v_xor_b32_e32 v6, v2, v0 -; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 -; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v8, vcc, 0, v5 -; CHECK-NEXT: v_subb_u32_e32 v9, vcc, 0, v6, vcc -; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CHECK-NEXT: v_trunc_f32_e32 v2, v1 -; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v7, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v10, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v11, v7, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v7, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v10, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v10, v0 -; CHECK-NEXT: v_mul_lo_u32 v12, v7, v1 -; CHECK-NEXT: v_mul_lo_u32 v13, v10, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v1 +; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v6, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 +; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 +; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1 +; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 +; CHECK-NEXT: v_trunc_f32_e32 v7, v5 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, v6 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] +; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v7, v8, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_mul_hi_u32 v11, v7, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v13, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CHECK-NEXT: v_mul_hi_u32 v1, v10, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v0 -; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v7, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v10, v[1:2] -; CHECK-NEXT: v_ashrrev_i32_e32 v8, 31, v4 -; CHECK-NEXT: v_mul_hi_u32 v11, v7, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v7, v[1:2] -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v8 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v8, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v2, v8 -; CHECK-NEXT: v_mul_lo_u32 v2, v10, v0 -; CHECK-NEXT: v_mul_lo_u32 v9, v7, v1 -; CHECK-NEXT: v_mul_hi_u32 v0, v10, v0 -; CHECK-NEXT: v_xor_b32_e32 v3, v3, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v11, v10, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; CHECK-NEXT: v_mul_hi_u32 v9, v7, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CHECK-NEXT: v_mul_hi_u32 v1, v10, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v3, v0 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v9, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0 -; CHECK-NEXT: v_mul_hi_u32 v10, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v0, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v9, 0 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v5, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, v6 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v7, v2, v9 +; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v4, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v10, v3, v9 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v9, v[1:2] -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v3, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v3, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v5 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_mul_lo_u32 v3, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v7, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v7, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v6, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v2, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v6, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v6, v[3:4] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc +; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v6 -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v8 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v8 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -2502,168 +2504,168 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v5, v8, v[1:2] ; GISEL-NEXT: v_lshl_b64 v[10:11], s[4:5], v6 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v12, v0 ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v14, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v0 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], v13, v8, vcc +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v8, vcc ; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v13, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v8, v1, v6, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v0, v7, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v7 +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v0, v7, vcc ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v13, v1, v6, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v11, v0, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v1, v0 -; GISEL-NEXT: v_xor_b32_e32 v10, v10, v0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v11 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v9, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v6, vcc +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v0, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, v1, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v8 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v12, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v15, s[4:5], 0, v9, vcc ; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v7 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v15, v7 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v15, v15, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v16, v10, v1, s[4:5] ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v16, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v16 +; GISEL-NEXT: v_trunc_f32_e32 v10, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v10 ; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 -; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 0, v11 -; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, v10, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 0, v6 +; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, v8, s[4:5] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v17, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v6, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v18, v16, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v19, v17, v[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v20, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v13, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v17, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v14, v20, vcc -; GISEL-NEXT: v_mul_hi_u32 v14, v17, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; GISEL-NEXT: v_cvt_u32_f32_e32 v20, v10 +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v9, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v20, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v11, v5 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v17, v[9:10] +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v7, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v20, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v17, v9 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v16, v5 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 -; GISEL-NEXT: v_mul_hi_u32 v7, v17, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GISEL-NEXT: v_mul_hi_u32 v5, v16, v5 +; GISEL-NEXT: v_mul_lo_u32 v11, v20, v9 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v17, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v9, v20, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v17, v0 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v16, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v7, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v9, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v18, v14, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v8, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v19, v7, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v9, v12, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v8 -; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v8 -; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v0 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v20, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v11, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v15, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v7, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v11, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v12, v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9 +; GISEL-NEXT: v_xor_b32_e32 v13, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v9 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v15, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v9, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v9, 0 +; GISEL-NEXT: v_mul_lo_u32 v10, v13, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v10, v13, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v5, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, v[5:6] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc ; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v10 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v2, v11 -; GISEL-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v10 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v11 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v8 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v10 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v6, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v12, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v8 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v8 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v8 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v7 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom: @@ -2683,131 +2685,131 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB8_2 ; CGP-NEXT: ; %bb.1: -; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v1, v0 -; CGP-NEXT: v_xor_b32_e32 v4, v2, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v0, v3 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v3 -; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v4, vcc -; CGP-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 -; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; CGP-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CGP-NEXT: v_trunc_f32_e32 v2, v1 -; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v0 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v13, v[1:2] -; CGP-NEXT: v_mul_hi_u32 v14, v10, v0 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v10, v[1:2] -; CGP-NEXT: v_mul_lo_u32 v2, v13, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v13, v0 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v1 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v1 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15 +; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v3 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v2, v1 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v3, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v1 +; CGP-NEXT: v_xor_b32_e32 v1, v2, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 +; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc +; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; CGP-NEXT: v_trunc_f32_e32 v4, v3 +; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v10, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v13, v[3:4] +; CGP-NEXT: v_mul_hi_u32 v14, v10, v2 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v10, v[3:4] +; CGP-NEXT: v_mul_lo_u32 v4, v13, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v13, v2 +; CGP-NEXT: v_mul_lo_u32 v15, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v16, v13, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v1 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v16, v2 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v1, v13, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v0 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v13, v[1:2] +; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v2 +; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v3, vcc +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v13, v[3:4] ; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v0 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v10, v[1:2] -; CGP-NEXT: v_add_i32_e32 v2, vcc, v8, v11 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v2 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v10, v[3:4] +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v11 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v2, v11 -; CGP-NEXT: v_mul_lo_u32 v2, v13, v0 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v1 -; CGP-NEXT: v_mul_hi_u32 v0, v13, v0 +; CGP-NEXT: v_xor_b32_e32 v9, v4, v11 +; CGP-NEXT: v_mul_lo_u32 v4, v13, v2 +; CGP-NEXT: v_mul_lo_u32 v12, v10, v3 +; CGP-NEXT: v_mul_hi_u32 v2, v13, v2 ; CGP-NEXT: v_xor_b32_e32 v8, v8, v11 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v14 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v13, v1 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v14, v13, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v1, v13, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v13, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v10, v9, v1 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v1 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v8, v1 +; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CGP-NEXT: v_mul_hi_u32 v10, v9, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v8, v2 +; CGP-NEXT: v_mul_lo_u32 v10, v9, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v9, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v8, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v9, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v12, v2 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v0, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v12, 0 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v2, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v9, v0 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v12, v[1:2] -; CGP-NEXT: v_subb_u32_e64 v2, s[4:5], v8, v1, vcc -; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v8, v1 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v2, v4 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v12, v[3:4] +; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v8, v3, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v8, v3 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v4 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v0, v3 -; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v1, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v2, v0 +; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v4 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v9, v0 ; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v0, v11 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v11 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 @@ -2854,135 +2856,137 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] ; CGP-NEXT: .LBB8_7: -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v10 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v2 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v10, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v6, v3, v2 -; CGP-NEXT: v_xor_b32_e32 v8, v4, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v8 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v6 -; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc -; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CGP-NEXT: v_trunc_f32_e32 v4, v3 -; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v9, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v9, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v12, v[3:4] -; CGP-NEXT: v_mul_hi_u32 v13, v9, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, v[3:4] -; CGP-NEXT: v_mul_lo_u32 v4, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v12, v2 -; CGP-NEXT: v_mul_lo_u32 v14, v9, v3 -; CGP-NEXT: v_mul_lo_u32 v15, v12, v3 +; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v10 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v9, v3 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v10, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v3 +; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 +; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 +; CGP-NEXT: v_trunc_f32_e32 v6, v6 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v11, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mov_b32_e32 v4, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 +; CGP-NEXT: v_mul_lo_u32 v14, v11, v9 +; CGP-NEXT: v_mul_lo_u32 v15, v6, v9 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v9, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v3, v12, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v2 -; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v3, vcc -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v9, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v12, v[3:4] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v7 -; CGP-NEXT: v_mul_hi_u32 v13, v9, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, v[3:4] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v10 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v4, v10 -; CGP-NEXT: v_mul_lo_u32 v4, v12, v2 -; CGP-NEXT: v_mul_lo_u32 v11, v9, v3 -; CGP-NEXT: v_mul_hi_u32 v2, v12, v2 -; CGP-NEXT: v_xor_b32_e32 v5, v5, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v12, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v9, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_mul_hi_u32 v3, v12, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mov_b32_e32 v4, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v4, v12 +; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 +; CGP-NEXT: v_mul_lo_u32 v10, v11, v9 +; CGP-NEXT: v_xor_b32_e32 v13, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v6, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v12, v3, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v5, v2 -; CGP-NEXT: v_mul_lo_u32 v9, v7, v3 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 -; CGP-NEXT: v_mul_hi_u32 v12, v5, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v5, v3 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v13, v4 +; CGP-NEXT: v_mul_lo_u32 v8, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v7, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v11, v2 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v2, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v4, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[3:4] -; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v5, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v5, v3 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v4, v6 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v8 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc -; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v6 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v6 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v8 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; CGP-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -3166,157 +3170,157 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v7, v[0:1] ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6 ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[7:8] -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v10, v4 -; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v11, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v11, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v0 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v9 -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v10 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v4, v5, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v3, vcc -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v6 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v10, v4 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v11, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], 0, v0 +; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v2 +; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 +; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v5, v3, vcc +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v7, v1 -; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, v6, v7, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v8, v1 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v6, v4 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v7, v5 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v7 ; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v0 -; GISEL-NEXT: v_sub_i32_e64 v16, s[4:5], 0, v9 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v10, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v15, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v6 +; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v13, vcc +; GISEL-NEXT: v_sub_i32_e64 v16, s[4:5], 0, v2 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v4, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v15, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v7 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v3 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_mov_b32_e32 v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v18, v[0:1] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v1 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v18, v[0:1] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v17, v15, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v15, v[6:7] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v6, v19, v0, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v0, v18, v4 -; GISEL-NEXT: v_mul_lo_u32 v19, v15, v5 -; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v2, v3, vcc -; GISEL-NEXT: v_mul_hi_u32 v2, v15, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v7, v19, v0, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v0, v18, v5 +; GISEL-NEXT: v_mul_lo_u32 v19, v15, v6 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v13, v3, vcc +; GISEL-NEXT: v_mul_hi_u32 v13, v15, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v18, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v18, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GISEL-NEXT: v_mul_hi_u32 v3, v15, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v4, v18, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v18, v2, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v4, 0 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v13, v1 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v5, v[0:1] -; GISEL-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v20, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v4, v[0:1] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v15, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v6, v14, v18, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v18, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v18, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v19, v0 +; GISEL-NEXT: v_mul_hi_u32 v19, v15, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_mul_hi_u32 v6, v18, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v18, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v13, 0 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v10, v1 +; GISEL-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v15, v[0:1] +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v10, v18, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v5, v2 -; GISEL-NEXT: v_mul_lo_u32 v7, v4, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v4, v2 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], 0, v12 -; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], v3, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v15, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v13, v5 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], 0, v12 +; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], v3, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v13, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v2, v5, v2 -; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], v7, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v4, v0 -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v13, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v7 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v5 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v0 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v13, v7 -; GISEL-NEXT: v_mul_hi_u32 v0, v5, v0 -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], v7, v3 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v3 -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v4, v2 -; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v5, v0, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v2 -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v6, vcc -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v2, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v6 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v15, v0, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v8, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GISEL-NEXT: v_mul_hi_u32 v9, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v2, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v0, v4 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v4, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6 +; GISEL-NEXT: v_mul_hi_u32 v0, v10, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v0, v7 +; GISEL-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v7, v[0:1] ; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v2, v9 -; GISEL-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v10 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v9, v[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc +; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v8, v5 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v10, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v4 +; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v6, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v3, v2 +; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v10 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v6, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v4 +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v6, v4, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc ; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index fd244d3bf2def..65455d754be4f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -6251,15 +6251,15 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; ; GFX10-LABEL: s_ssubsat_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sub_u32 s16, s0, s8 -; GFX10-NEXT: s_subb_u32 s17, s1, s9 -; GFX10-NEXT: s_subb_u32 s18, s2, s10 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] -; GFX10-NEXT: s_subb_u32 s19, s3, s11 -; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] +; GFX10-NEXT: s_sub_u32 s18, s0, s8 +; GFX10-NEXT: s_subb_u32 s19, s1, s9 +; GFX10-NEXT: s_subb_u32 s16, s2, s10 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[18:19], s[0:1] +; GFX10-NEXT: s_subb_u32 s17, s3, s11 +; GFX10-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] ; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s20 @@ -6268,7 +6268,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_ashr_i32 s8, s19, 31 +; GFX10-NEXT: s_ashr_i32 s8, s17, 31 ; GFX10-NEXT: s_and_b32 s1, 1, s1 ; GFX10-NEXT: s_add_u32 s9, s8, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 @@ -6304,12 +6304,12 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, s16 -; GFX10-NEXT: v_mov_b32_e32 v4, s17 +; GFX10-NEXT: v_mov_b32_e32 v3, s18 +; GFX10-NEXT: v_mov_b32_e32 v4, s19 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; GFX10-NEXT: v_mov_b32_e32 v0, s16 ; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, s19 +; GFX10-NEXT: v_mov_b32_e32 v2, s17 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index df8f3a702e885..77737b356ff6e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -365,61 +365,61 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-LABEL: v_udiv_v2i64: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v5 +; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 ; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v7 -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc -; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 -; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v11 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v7 +; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v5, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v7, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 +; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v9 -; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v10 -; GISEL-NEXT: v_trunc_f32_e32 v11, v11 -; GISEL-NEXT: v_trunc_f32_e32 v12, v12 -; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v11 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 +; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 +; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 +; GISEL-NEXT: v_trunc_f32_e32 v13, v13 +; GISEL-NEXT: v_trunc_f32_e32 v14, v14 +; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 ; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v18, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v19, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 +; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v10, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v18 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v8, v9 +; GISEL-NEXT: v_mul_hi_u32 v20, v8, v10 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v9, v16 +; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 +; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 ; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v9, v19 +; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 ; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 +; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 +; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v17 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] @@ -435,166 +435,166 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 ; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v18 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 +; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 +; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 +; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v8, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v8, v9 +; GISEL-NEXT: v_mul_hi_u32 v18, v8, v10 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v10 -; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v9, v20 -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v19 -; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 +; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 +; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 +; GISEL-NEXT: v_mul_lo_u32 v8, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 +; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v11, v8 -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v10, v13 +; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 +; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v13, v8 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 +; GISEL-NEXT: v_mul_hi_u32 v12, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 ; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v10, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] ; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 ; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v17 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 ; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 +; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v12 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v13, v8, vcc +; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8 ; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 ; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v15, v1, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v21, v3, v9 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v10 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v18, v8 +; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v19 ; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v16 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v16, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v17, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v18, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v19, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v20, v7, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v9 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v16 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 1, v10 -; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v19 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], 1, v15 -; GISEL-NEXT: v_add_i32_e64 v8, s[12:13], v8, v13 -; GISEL-NEXT: v_add_i32_e64 v13, s[12:13], 1, v16 -; GISEL-NEXT: v_add_i32_e64 v11, s[14:15], v11, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v10, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v16, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v17, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v18, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v8 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14 +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], 1, v9 +; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v18 +; GISEL-NEXT: v_add_i32_e64 v18, s[10:11], 1, v13 +; GISEL-NEXT: v_add_i32_e64 v10, s[12:13], v15, v10 +; GISEL-NEXT: v_add_i32_e64 v15, s[12:13], 1, v14 +; GISEL-NEXT: v_add_i32_e64 v12, s[14:15], v21, v12 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[14:15], v0, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[16:17], v2, v6 ; GISEL-NEXT: v_sub_i32_e64 v0, s[18:19], v0, v4 ; GISEL-NEXT: v_sub_i32_e64 v2, s[20:21], v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v20, v4, v10 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[22:23], v0, v4 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v6, v11 +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v10, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, v6, v12 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], 0, v11, s[6:7] +; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], 0, v12, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v17, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[16:17] -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v20, v4 -; GISEL-NEXT: v_addc_u32_e64 v20, s[6:7], 0, v0, s[10:11] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v12, v18 -; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], 0, v2, s[12:13] -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v14 -; GISEL-NEXT: v_subb_u32_e64 v14, s[6:7], v1, v12, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v12 -; GISEL-NEXT: v_subb_u32_e64 v12, s[6:7], v3, v4, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[16:17] +; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v19, v4 +; GISEL-NEXT: v_addc_u32_e64 v19, s[6:7], 0, v0, s[10:11] +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v17 +; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], 0, v2, s[12:13] +; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v11 +; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], v1, v16, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v16 +; GISEL-NEXT: v_subb_u32_e64 v16, s[6:7], v3, v4, s[8:9] ; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[22:23] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v11, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v5, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v7 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v7 ; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[8:9] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v6, v12, v6, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v16, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v6, v16, v6, s[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[18:19] ; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[20:21] -; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, v17, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v16, v16, v20, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v7 ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, v5 @@ -602,19 +602,19 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v12 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[8:9] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v1 ; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v1, v15, v19, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v16, v13, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v20, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v2, v18, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v3, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v5, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v1, v13, v18, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v15, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v19, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v2, v17, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v12, v5, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64: @@ -1250,61 +1250,61 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000 ; GISEL-NEXT: v_lshl_b64 v[7:8], s[4:5], v4 ; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v8 ; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v5 -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v4 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v8, vcc -; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v5, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 -; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v11 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v5 +; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v4 +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v8, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v5, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 +; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v9 -; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v10 -; GISEL-NEXT: v_trunc_f32_e32 v11, v11 -; GISEL-NEXT: v_trunc_f32_e32 v12, v12 -; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v11 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 +; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 +; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 +; GISEL-NEXT: v_trunc_f32_e32 v13, v13 +; GISEL-NEXT: v_trunc_f32_e32 v14, v14 +; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 ; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v18, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v19, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v6, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 +; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v10, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v18 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v6, v9 +; GISEL-NEXT: v_mul_hi_u32 v20, v6, v10 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v9, v16 +; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 +; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 ; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v9, v19 +; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 ; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 +; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 +; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v17 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] @@ -1320,144 +1320,144 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 ; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v18 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 +; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 +; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 +; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v9 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v10 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v10 -; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v9, v20 -; GISEL-NEXT: v_mul_lo_u32 v6, v6, v11 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v19 -; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 +; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 +; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 +; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v11, v6 -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, v10, v13 +; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 +; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v13, v6 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 +; GISEL-NEXT: v_mul_hi_u32 v12, v10, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6 +; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 ; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v10, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] ; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 ; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v17 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 ; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v11, v6, vcc -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 +; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 +; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v12 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc +; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v12, v0, v6 ; GISEL-NEXT: v_mul_lo_u32 v13, v1, v6 ; GISEL-NEXT: v_mul_hi_u32 v14, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v15, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v21, v3, v9 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v10 +; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v18, v6 +; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 +; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v19 ; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v16 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v16, v7, v9 -; GISEL-NEXT: v_mul_lo_u32 v17, v8, v9 -; GISEL-NEXT: v_mul_hi_u32 v18, v7, v9 -; GISEL-NEXT: v_mul_lo_u32 v19, v4, v10 -; GISEL-NEXT: v_mul_lo_u32 v20, v5, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v9 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v16 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 1, v10 -; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v19 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], 1, v15 -; GISEL-NEXT: v_add_i32_e64 v6, s[12:13], v6, v13 -; GISEL-NEXT: v_add_i32_e64 v13, s[12:13], 1, v16 -; GISEL-NEXT: v_add_i32_e64 v11, s[14:15], v11, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v10, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v14, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v6 +; GISEL-NEXT: v_mul_hi_u32 v17, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v18, v4, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v6 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14 +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], 1, v9 +; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v18 +; GISEL-NEXT: v_add_i32_e64 v18, s[10:11], 1, v13 +; GISEL-NEXT: v_add_i32_e64 v10, s[12:13], v15, v10 +; GISEL-NEXT: v_add_i32_e64 v15, s[12:13], 1, v14 +; GISEL-NEXT: v_add_i32_e64 v12, s[14:15], v21, v12 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[14:15], v0, v7 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[16:17], v2, v4 ; GISEL-NEXT: v_sub_i32_e64 v0, s[18:19], v0, v7 ; GISEL-NEXT: v_sub_i32_e64 v2, s[20:21], v2, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v20, v7, v10 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[22:23], v0, v7 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v10, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; GISEL-NEXT: v_mul_lo_u32 v2, v4, v11 -; GISEL-NEXT: v_add_i32_e64 v4, s[24:25], v17, v12 -; GISEL-NEXT: v_addc_u32_e64 v7, s[6:7], 0, v11, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], v20, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v18 +; GISEL-NEXT: v_mul_lo_u32 v2, v4, v12 +; GISEL-NEXT: v_add_i32_e64 v4, s[24:25], v16, v20 +; GISEL-NEXT: v_addc_u32_e64 v7, s[6:7], 0, v12, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], v19, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[14:15] +; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v17 ; GISEL-NEXT: v_subb_u32_e64 v17, s[6:7], v1, v4, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[16:17] @@ -1470,18 +1470,18 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v1, v8 ; GISEL-NEXT: v_addc_u32_e64 v1, s[12:13], 0, v7, s[12:13] ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[22:23] -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 -; GISEL-NEXT: v_subb_u32_e64 v14, vcc, v3, v2, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_subb_u32_e64 v11, vcc, v3, v2, s[8:9] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v14, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5 ; GISEL-NEXT: v_subb_u32_e64 v2, s[8:9], v2, v5, s[8:9] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v14, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v11, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v2, vcc, 0, v2, s[20:21] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[14:15] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v14, v4, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v16, s[14:15] +; GISEL-NEXT: v_cndmask_b32_e64 v4, v11, v4, s[8:9] ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] @@ -1489,17 +1489,17 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v18, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v19, s[6:7] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v2 ; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v15, v19, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v16, v13, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v2, v13, v18, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v15, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v17, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v5, v7, v1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v3, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v5, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v12, v5, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64_pow2_shl_denom: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 56943531ba8ae..fba8ef2948ade 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -1072,185 +1072,185 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v5, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v6, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v6, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v4, s9 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s8, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v5, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v4, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v7, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s8, v0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v6, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v4, s13 +; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v1, vcc +; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s9, v1 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v9, v1, v2, s[0:1] -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v0, v3, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s14 -; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v7 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v4, vcc -; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v5 -; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v14, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v14 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v0 -; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v6, s[0:1] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1] +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s15 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v4, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s14 +; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 +; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v8 +; GFX8-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v5, vcc +; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v6 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 +; GFX8-NEXT: v_trunc_f32_e32 v14, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v14 +; GFX8-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v1 +; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v7, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v15, 0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v14, v14 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v2, v16, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v14, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e64 v16, v3, v16, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v14, v[2:3] ; GFX8-NEXT: v_add_u32_e64 v17, s[0:1], 1, v12 ; GFX8-NEXT: v_addc_u32_e64 v18, s[0:1], 0, v13, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v15, v[1:2] -; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v4, v3, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v14, v0 -; GFX8-NEXT: v_mul_lo_u32 v4, v15, v1 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v15, v[2:3] +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v5, v4, vcc +; GFX8-NEXT: v_mul_lo_u32 v4, v14, v1 +; GFX8-NEXT: v_mul_lo_u32 v5, v15, v2 ; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v2, vcc -; GFX8-NEXT: v_mul_hi_u32 v2, v15, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v14, v1 -; GFX8-NEXT: v_mul_hi_u32 v0, v14, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 -; GFX8-NEXT: v_mul_hi_u32 v4, v15, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v3, vcc +; GFX8-NEXT: v_mul_hi_u32 v3, v15, v1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_mul_lo_u32 v4, v14, v2 ; GFX8-NEXT: v_mul_hi_u32 v1, v14, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v0 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 +; GFX8-NEXT: v_mul_hi_u32 v5, v15, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v4, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v1 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v2, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v12, v17, vcc -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v14, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v12, v17, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v14, v[1:2] ; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v18, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v15, v[3:4] +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v15, v[4:5] ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v19, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v6, v14, v2 -; GFX8-NEXT: v_mul_lo_u32 v9, v15, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GFX8-NEXT: v_mul_hi_u32 v7, v15, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v11, v20, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v6, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v6, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, v10, v19, s[0:1] ; GFX8-NEXT: v_mul_lo_u32 v7, v14, v3 -; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2 -; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v9, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, v15, v3 -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v7, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v2, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v9, v15, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; GFX8-NEXT: v_mul_hi_u32 v8, v15, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v11, v20, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v8, v14, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v14, v3 -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v7, v6 -; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v6 -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v15, v2 -; GFX8-NEXT: v_addc_u32_e64 v3, s[0:1], v14, v3, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v6, s11, v2 -; GFX8-NEXT: v_mul_lo_u32 v7, s10, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX8-NEXT: v_mul_hi_u32 v8, s10, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, s11, v2 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v8, s11, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_mul_hi_u32 v7, s10, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 +; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v9, v7 +; GFX8-NEXT: v_mul_hi_u32 v9, v15, v4 +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v8, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v8, s[0:1], v8, v9 +; GFX8-NEXT: v_mul_hi_u32 v4, v14, v4 +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v8, v7 +; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v4, v7 +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v15, v3 +; GFX8-NEXT: v_addc_u32_e64 v4, s[0:1], v14, v4, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v7, s11, v3 +; GFX8-NEXT: v_mul_lo_u32 v8, s10, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v6, vcc +; GFX8-NEXT: v_mul_hi_u32 v0, s10, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, s11, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v8, v0 +; GFX8-NEXT: v_mul_hi_u32 v8, s10, v4 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, s11, v3 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v8, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s14, v9, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v10, s11 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s15, v8, v[6:7] -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v2 -; GFX8-NEXT: v_subb_u32_e64 v10, s[0:1], v10, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s11, v6 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v10 -; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s14, v7 -; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v2, vcc -; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v8 -; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v9, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v8, s11, v4 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v9, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v8, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, v10, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, s11 +; GFX8-NEXT: v_mov_b32_e32 v0, s15 +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s15, v9, v[7:8] +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s10, v3 +; GFX8-NEXT: v_subb_u32_e64 v11, s[0:1], v4, v7, vcc +; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v7 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v11 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v8 +; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v3, vcc +; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v9 +; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v11 -; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v3, vcc +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v3, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v12 -; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, s14, v11 +; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, s14, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v13 -; GFX8-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v2, vcc +; GFX8-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v14, s[0:1] ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v16, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v14, v17, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v18, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v8, v12, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v8, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v9, s5 -; GFX8-NEXT: v_mov_b32_e32 v8, s4 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v13, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v10, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc +; GFX8-NEXT: v_mov_b32_e32 v9, s4 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v0, s[0:1] +; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[1:4] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[5:8] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udivrem_v2i64: @@ -1298,7 +1298,6 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s13 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 ; GFX9-NEXT: s_sub_u32 s2, 0, s14 @@ -1338,181 +1337,183 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v6, 0 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v8, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v8, v3, v0, v5 +; GFX9-NEXT: v_add3_u32 v9, v3, v0, v5 ; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v8, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s8, v1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v6, v[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v5, s9 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_subb_co_u32_e64 v10, s[0:1], v4, v2, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v10 -; GFX9-NEXT: v_sub_u32_e32 v1, s9, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v2, v3, s[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s15 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v7, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s14 -; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v12, vcc, s12, v9 -; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], 1, v6 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 -; GFX9-NEXT: v_trunc_f32_e32 v15, v2 -; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v15 -; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v1 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v13 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v12 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s13, v8, v[2:3] +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v1 +; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v5, v3, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v1 +; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v4, v5, s[0:1] +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s15 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v6, vcc +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s14 +; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s12, v2 +; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v8 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; GFX9-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 +; GFX9-NEXT: v_trunc_f32_e32 v15, v4 +; GFX9-NEXT: v_mul_f32_e32 v4, 0xcf800000, v15 +; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v3 +; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v9, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v16, 0 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v16, 0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v13 -; GFX9-NEXT: v_cndmask_b32_e64 v17, v3, v17, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v18, s[0:1], 1, v5 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v17, v5, v17, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v18, s[0:1], 1, v13 ; GFX9-NEXT: v_addc_co_u32_e64 v19, s[0:1], 0, v14, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v16, v[2:3] -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v15, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v16, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s12, v12 -; GFX9-NEXT: v_subbrev_co_u32_e32 v21, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_hi_u32 v3, v16, v1 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v16, v[4:5] +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3 +; GFX9-NEXT: v_mul_lo_u32 v7, v16, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s12, v11 +; GFX9-NEXT: v_subbrev_co_u32_e32 v21, vcc, 0, v5, vcc +; GFX9-NEXT: v_mul_hi_u32 v5, v16, v3 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2 -; GFX9-NEXT: v_mul_hi_u32 v1, v15, v1 -; GFX9-NEXT: v_add_u32_e32 v3, v7, v3 -; GFX9-NEXT: v_mul_hi_u32 v7, v16, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v4, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v7 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v15, v4 +; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3 +; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, v16, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v16, v1 -; GFX9-NEXT: v_add3_u32 v2, v4, v3, v2 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v7, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v16, v3 +; GFX9-NEXT: v_add3_u32 v4, v6, v5, v4 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v16, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v4, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v18, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[1:2] -; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v19, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v7, v[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v14, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v18, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v15, v[3:4] +; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v19, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v16, v[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v13, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v17 -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, v7, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v20, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v12, v7, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v13, v21, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v12, v15, v4 -; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX9-NEXT: v_add_u32_e32 v6, v8, v6 -; GFX9-NEXT: v_mul_hi_u32 v8, v7, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v12, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v8 +; GFX9-NEXT: v_mul_lo_u32 v8, v15, v5 +; GFX9-NEXT: v_mul_lo_u32 v9, v16, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v20, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v11, v16, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v12, v21, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], v8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], v8, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v6 -; GFX9-NEXT: v_add_u32_e32 v8, v12, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_add3_u32 v4, v8, v6, v4 -; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v7, v3 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], v15, v4, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v6, s11, v3 -; GFX9-NEXT: v_mul_lo_u32 v7, s10, v4 -; GFX9-NEXT: v_mul_hi_u32 v8, s10, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, s11, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v8, s11, v4 -; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_mul_hi_u32 v7, s10, v4 -; GFX9-NEXT: v_mul_hi_u32 v13, s11, v4 -; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v8, v3 +; GFX9-NEXT: v_mul_lo_u32 v11, v15, v6 +; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5 +; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 +; GFX9-NEXT: v_mul_hi_u32 v9, v16, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, v15, v6 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v11, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v8 +; GFX9-NEXT: v_add_u32_e32 v9, v11, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], v3, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v9, 0 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc -; GFX9-NEXT: v_add3_u32 v10, v7, v12, v13 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, v10, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v11, s11 -; GFX9-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s15, v9, v[7:8] -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s10, v3 -; GFX9-NEXT: v_subb_co_u32_e64 v11, s[0:1], v11, v7, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v11 -; GFX9-NEXT: v_sub_u32_e32 v3, s11, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v11 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v12, vcc, s14, v8 -; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v9 -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v10, s[0:1] +; GFX9-NEXT: v_add3_u32 v6, v9, v8, v6 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v16, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v15, v6, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v8, s11, v5 +; GFX9-NEXT: v_mul_lo_u32 v9, s10, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v7, vcc +; GFX9-NEXT: v_mul_hi_u32 v2, s10, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, s11, v5 +; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], v8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v8, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v8, s11, v6 +; GFX9-NEXT: v_add_u32_e32 v2, v9, v2 +; GFX9-NEXT: v_mul_hi_u32 v9, s10, v6 +; GFX9-NEXT: v_mul_hi_u32 v13, s11, v6 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v8, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], v5, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s14, v12, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v1, v10, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v11, v9 +; GFX9-NEXT: v_add3_u32 v9, v1, v2, v13 +; GFX9-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v9, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v10, s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s15 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v12, v[1:2] +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v5 +; GFX9-NEXT: v_subb_co_u32_e64 v10, s[0:1], v10, v1, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v10 +; GFX9-NEXT: v_sub_u32_e32 v1, s11, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v10 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s14, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v12 +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v12 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v11 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v13 -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s14, v12 +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s14, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14 -; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v3, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v18, s[0:1], 0, v15, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v14, v17, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v15, v18, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v9, v13, v20, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v9, s[0:1] -; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5] -; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[0:1] +; GFX9-NEXT: global_store_dwordx4 v0, v[3:6], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v0, v[7:10], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index 09e39569a5abb..097f6642cbc66 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -359,61 +359,61 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-LABEL: v_urem_v2i64: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v5 +; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 ; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v7 -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc -; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 -; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v11 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v7 +; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v5, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v7, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 +; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v9 -; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v10 -; GISEL-NEXT: v_trunc_f32_e32 v11, v11 -; GISEL-NEXT: v_trunc_f32_e32 v12, v12 -; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v11 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 +; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 +; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 +; GISEL-NEXT: v_trunc_f32_e32 v13, v13 +; GISEL-NEXT: v_trunc_f32_e32 v14, v14 +; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 ; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v18, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v19, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 +; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v10, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v18 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v8, v9 +; GISEL-NEXT: v_mul_hi_u32 v20, v8, v10 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v9, v16 +; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 +; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 ; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v9, v19 +; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 ; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 +; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 +; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v17 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] @@ -429,102 +429,102 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 ; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v18 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 +; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 +; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 +; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v8, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v8, v9 +; GISEL-NEXT: v_mul_hi_u32 v18, v8, v10 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v10 -; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v9, v20 -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v19 -; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 +; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 +; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 +; GISEL-NEXT: v_mul_lo_u32 v8, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 +; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v11, v8 -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v10, v13 +; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 +; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v13, v8 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 +; GISEL-NEXT: v_mul_hi_u32 v12, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 ; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v10, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] ; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 ; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v17 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 ; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 +; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v12 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v13, v8, vcc +; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8 ; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 ; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8 ; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 +; GISEL-NEXT: v_mul_lo_u32 v15, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 ; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v10 +; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v20, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 ; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 +; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] @@ -533,28 +533,28 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v16, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v17, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v4, v10 +; GISEL-NEXT: v_mul_lo_u32 v17, v5, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v4, v10 +; GISEL-NEXT: v_mul_lo_u32 v18, v6, v11 +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v6, v11 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 ; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v18 ; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v13 -; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v12 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v6 ; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v4 ; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v6 ; GISEL-NEXT: v_mul_lo_u32 v8, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v6, v11 +; GISEL-NEXT: v_mul_lo_u32 v9, v6, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v4 @@ -562,11 +562,11 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_sub_i32_e64 v4, s[14:15], v12, v4 ; GISEL-NEXT: v_sub_i32_e64 v6, s[16:17], v13, v6 ; GISEL-NEXT: v_add_i32_e64 v8, s[18:19], v17, v8 -; GISEL-NEXT: v_add_i32_e64 v11, s[18:19], v19, v11 +; GISEL-NEXT: v_add_i32_e64 v9, s[18:19], v19, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v9 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v11, v10 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v10 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v11 ; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v8, vcc ; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v8 ; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], v3, v9, s[4:5] @@ -1751,63 +1751,63 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000 -; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 -; GISEL-NEXT: v_lshl_b64 v[6:7], s[4:5], v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v5 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v7 -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc -; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 -; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v11 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_lshl_b64 v[7:8], s[4:5], v4 +; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v8 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v5 +; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v4 +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v8, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v5, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 +; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v9 -; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v10 -; GISEL-NEXT: v_trunc_f32_e32 v11, v11 -; GISEL-NEXT: v_trunc_f32_e32 v12, v12 -; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v11 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 +; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 +; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 +; GISEL-NEXT: v_trunc_f32_e32 v13, v13 +; GISEL-NEXT: v_trunc_f32_e32 v14, v14 +; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 ; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v18, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v19, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v6, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 +; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v10, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v18 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v8, v9 +; GISEL-NEXT: v_mul_hi_u32 v20, v6, v10 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v9, v16 +; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 +; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 ; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v9, v19 +; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 ; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 +; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 +; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v17 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] @@ -1823,102 +1823,102 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 ; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v18 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 +; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 +; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 +; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v8, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v8, v9 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v10 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v10 -; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v9, v20 -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v19 -; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v11, v8 -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v10, v13 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 +; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 +; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 +; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 +; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v13, v6 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 +; GISEL-NEXT: v_mul_hi_u32 v12, v10, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6 +; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 ; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v10, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] ; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 ; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v17 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 ; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 +; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 +; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v12 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc +; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v12, v0, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, v1, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 ; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v10 +; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v20, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 ; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 +; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] @@ -1927,80 +1927,80 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v16, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v17, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v17, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v18, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v19, v5, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 ; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v18 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v13 -; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v6 -; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v4 -; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v8, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v6, v11 +; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v13 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v12 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v4 +; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v7 +; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v4 +; GISEL-NEXT: v_mul_lo_u32 v6, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v13, v6 -; GISEL-NEXT: v_sub_i32_e64 v4, s[14:15], v12, v4 -; GISEL-NEXT: v_sub_i32_e64 v6, s[16:17], v13, v6 -; GISEL-NEXT: v_add_i32_e64 v8, s[18:19], v17, v8 -; GISEL-NEXT: v_add_i32_e64 v11, s[18:19], v19, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v13, v4 +; GISEL-NEXT: v_sub_i32_e64 v7, s[14:15], v12, v7 +; GISEL-NEXT: v_sub_i32_e64 v4, s[16:17], v13, v4 +; GISEL-NEXT: v_add_i32_e64 v6, s[18:19], v17, v6 +; GISEL-NEXT: v_add_i32_e64 v9, s[18:19], v19, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v9 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v11, v10 -; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v8 -; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], v3, v9, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v10 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v11 +; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v6 +; GISEL-NEXT: v_subb_u32_e64 v6, s[6:7], v3, v9, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v5 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v8, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v8 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v5, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v8 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v18, vcc, 0, v1, s[10:11] -; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v5, s[10:11] +; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v8, s[10:11] ; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e64 v14, vcc, 0, v3, s[12:13] -; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v7, s[12:13] +; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v5, s[12:13] ; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v8 ; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[14:15] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v5 ; GISEL-NEXT: v_subbrev_u32_e64 v3, s[6:7], 0, v3, s[16:17] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v14, v7 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v8 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v14, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v16, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v17, s[8:9] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v5 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v6, s[8:9] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v12, v7, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v4, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v1, v18, v1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v3, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_pow2_shl_denom: diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index e8ceeece372d4..7c846ed551713 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -581,35 +581,35 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0 ; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 -; GFX908-NEXT: s_mov_b64 s[20:21], s[10:11] +; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX908-NEXT: v_mov_b32_e32 v10, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s5, v2 ; GFX908-NEXT: v_readfirstlane_b32 s9, v3 ; GFX908-NEXT: s_add_u32 s5, s5, 1 ; GFX908-NEXT: s_addc_u32 s9, s9, 0 -; GFX908-NEXT: s_mul_hi_u32 s19, s2, s5 +; GFX908-NEXT: s_mul_hi_u32 s21, s2, s5 ; GFX908-NEXT: s_mul_i32 s22, s3, s5 -; GFX908-NEXT: s_mul_i32 s18, s2, s5 +; GFX908-NEXT: s_mul_i32 s20, s2, s5 ; GFX908-NEXT: s_mul_i32 s5, s2, s9 -; GFX908-NEXT: s_add_i32 s5, s19, s5 +; GFX908-NEXT: s_add_i32 s5, s21, s5 ; GFX908-NEXT: s_add_i32 s5, s5, s22 ; GFX908-NEXT: s_branch .LBB3_5 ; GFX908-NEXT: .LBB3_4: ; %bb58 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX908-NEXT: s_add_u32 s20, s20, s0 +; GFX908-NEXT: s_add_u32 s18, s18, s0 ; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3] -; GFX908-NEXT: s_addc_u32 s21, s21, s1 +; GFX908-NEXT: s_addc_u32 s19, s19, s1 ; GFX908-NEXT: s_mov_b64 s[22:23], 0 ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25] ; GFX908-NEXT: s_cbranch_vccz .LBB3_9 ; GFX908-NEXT: .LBB3_5: ; %bb16 ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: s_add_u32 s22, s20, s18 -; GFX908-NEXT: s_addc_u32 s23, s21, s5 +; GFX908-NEXT: s_add_u32 s22, s18, s20 +; GFX908-NEXT: s_addc_u32 s23, s19, s5 ; GFX908-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc @@ -657,7 +657,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX908-NEXT: ; implicit-def: $sgpr20_sgpr21 +; GFX908-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: s_xor_b64 s[16:17], s[22:23], -1 @@ -742,26 +742,26 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0 ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1 -; GFX90A-NEXT: s_mov_b64 s[20:21], s[10:11] +; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s5, v4 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v5 ; GFX90A-NEXT: s_add_u32 s5, s5, 1 ; GFX90A-NEXT: s_addc_u32 s9, s9, 0 -; GFX90A-NEXT: s_mul_hi_u32 s19, s2, s5 +; GFX90A-NEXT: s_mul_hi_u32 s21, s2, s5 ; GFX90A-NEXT: s_mul_i32 s22, s3, s5 -; GFX90A-NEXT: s_mul_i32 s18, s2, s5 +; GFX90A-NEXT: s_mul_i32 s20, s2, s5 ; GFX90A-NEXT: s_mul_i32 s5, s2, s9 -; GFX90A-NEXT: s_add_i32 s5, s19, s5 +; GFX90A-NEXT: s_add_i32 s5, s21, s5 ; GFX90A-NEXT: s_add_i32 s5, s5, s22 ; GFX90A-NEXT: s_branch .LBB3_5 ; GFX90A-NEXT: .LBB3_4: ; %bb58 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: s_add_u32 s20, s20, s0 -; GFX90A-NEXT: s_addc_u32 s21, s21, s1 +; GFX90A-NEXT: s_add_u32 s18, s18, s0 +; GFX90A-NEXT: s_addc_u32 s19, s19, s1 ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5] ; GFX90A-NEXT: s_mov_b64 s[22:23], 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25] @@ -769,8 +769,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: .LBB3_5: ; %bb16 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: s_add_u32 s22, s20, s18 -; GFX90A-NEXT: s_addc_u32 s23, s21, s5 +; GFX90A-NEXT: s_add_u32 s22, s18, s20 +; GFX90A-NEXT: s_addc_u32 s23, s19, s5 ; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc @@ -811,7 +811,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; implicit-def: $vgpr10_vgpr11 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $sgpr20_sgpr21 +; GFX90A-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: s_xor_b64 s[16:17], s[22:23], -1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index edab417a03ced..1b8216f4aa2a6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -9219,19 +9219,19 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_mov_b64 s[12:13], 0x1000 +; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[8:9], s[12:13], s8 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[12:13], s10 -; GFX6-NEXT: s_ashr_i32 s14, s9, 31 -; GFX6-NEXT: s_add_u32 s8, s8, s14 +; GFX6-NEXT: s_lshl_b64 s[12:13], s[2:3], s10 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GFX6-NEXT: s_ashr_i32 s14, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s14 ; GFX6-NEXT: s_mov_b32 s15, s14 -; GFX6-NEXT: s_addc_u32 s9, s9, s14 -; GFX6-NEXT: s_xor_b64 s[12:13], s[8:9], s[14:15] -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX6-NEXT: s_sub_u32 s10, 0, s12 -; GFX6-NEXT: s_subb_u32 s11, 0, s13 +; GFX6-NEXT: s_addc_u32 s3, s3, s14 +; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[14:15] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX6-NEXT: s_sub_u32 s10, 0, s2 +; GFX6-NEXT: s_subb_u32 s11, 0, s3 ; GFX6-NEXT: s_ashr_i32 s16, s5, 31 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 @@ -9306,23 +9306,23 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, s13 +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s12, v3 +; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 ; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 ; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] @@ -9332,22 +9332,22 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] ; GFX6-NEXT: s_xor_b64 s[0:1], s[16:17], s[14:15] -; GFX6-NEXT: s_ashr_i32 s4, s3, 31 -; GFX6-NEXT: s_add_u32 s2, s2, s4 +; GFX6-NEXT: s_ashr_i32 s4, s13, 31 +; GFX6-NEXT: s_add_u32 s12, s12, s4 ; GFX6-NEXT: v_mov_b32_e32 v6, s5 ; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_addc_u32 s3, s3, s4 -; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX6-NEXT: s_addc_u32 s13, s13, s4 +; GFX6-NEXT: s_xor_b64 s[12:13], s[12:13], s[4:5] ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s12 +; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s13 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 ; GFX6-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 ; GFX6-NEXT: v_rcp_f32_e32 v6, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v6 @@ -9356,16 +9356,16 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: s_sub_u32 s12, 0, s2 +; GFX6-NEXT: s_sub_u32 s2, 0, s12 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_mul_hi_u32 v4, s12, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v3 -; GFX6-NEXT: s_subb_u32 s13, 0, s3 -; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v3 +; GFX6-NEXT: s_subb_u32 s3, 0, s13 +; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v6, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 @@ -9384,11 +9384,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 @@ -9403,14 +9403,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: s_ashr_i32 s12, s7, 31 +; GFX6-NEXT: s_ashr_i32 s2, s7, 31 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: s_add_u32 s6, s6, s12 +; GFX6-NEXT: s_add_u32 s6, s6, s2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: s_addc_u32 s7, s7, s12 +; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: s_addc_u32 s7, s7, s2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[12:13] +; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 ; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 @@ -9426,25 +9426,25 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, s3 +; GFX6-NEXT: v_mov_b32_e32 v7, s13 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s6, v5 ; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc -; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s2, v5 +; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 ; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v7 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v2 ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] @@ -9455,15 +9455,15 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cndmask_b32_e64 v7, v8, v10, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v8, s7 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v4 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v5 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[4:5] +; GFX6-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 @@ -10454,17 +10454,17 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s10 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[2:3], s10 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 ; GFX6-NEXT: s_ashr_i32 s8, s3, 31 ; GFX6-NEXT: s_add_u32 s2, s2, s8 ; GFX6-NEXT: s_mov_b32 s9, s8 ; GFX6-NEXT: s_addc_u32 s3, s3, s8 -; GFX6-NEXT: s_xor_b64 s[16:17], s[2:3], s[8:9] -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s17 -; GFX6-NEXT: s_sub_u32 s2, 0, s16 -; GFX6-NEXT: s_subb_u32 s3, 0, s17 +; GFX6-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15 +; GFX6-NEXT: s_sub_u32 s2, 0, s14 +; GFX6-NEXT: s_subb_u32 s3, 0, s15 ; GFX6-NEXT: s_ashr_i32 s12, s5, 31 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 @@ -10538,46 +10538,46 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, s16, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s16, v0 -; GFX6-NEXT: v_mul_lo_u32 v3, s17, v0 -; GFX6-NEXT: v_mul_lo_u32 v0, s16, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s14, v1 +; GFX6-NEXT: v_mul_hi_u32 v2, s14, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s15, v0 +; GFX6-NEXT: v_mul_lo_u32 v0, s14, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: v_mov_b32_e32 v3, s15 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc -; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s16, v0 +; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s14, v0 ; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v5 +; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s15, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v4 +; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v4 ; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v5 -; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s16, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s15, v5 +; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s14, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GFX6-NEXT: s_ashr_i32 s0, s15, 31 -; GFX6-NEXT: s_add_u32 s2, s14, s0 +; GFX6-NEXT: s_ashr_i32 s0, s17, 31 +; GFX6-NEXT: s_add_u32 s2, s16, s0 ; GFX6-NEXT: s_mov_b32 s1, s0 -; GFX6-NEXT: s_addc_u32 s3, s15, s0 +; GFX6-NEXT: s_addc_u32 s3, s17, s0 ; GFX6-NEXT: v_mov_b32_e32 v4, s5 ; GFX6-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s5 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s15, v1 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s14, v0 ; GFX6-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GFX6-NEXT: v_rcp_f32_e32 v4, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s17, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s15, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 9a2e7874ea1f4..5990736f664fb 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -17,7 +17,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4) ; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4) ; GFX90A-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr33, 0, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_CSELECT_B64 -1, 0, implicit killed $scc ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 @@ -33,7 +33,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.bb103: ; GFX90A-NEXT: successors: %bb.58(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -46,7 +46,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr24, $sgpr33, $vgpr31, $agpr0, $vgpr26, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr58, $sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3, $vgpr20, $vgpr22 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr24, $sgpr33, $vgpr31, $agpr0, $vgpr26, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3, $vgpr20, $vgpr22 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF @@ -59,7 +59,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow17: ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.57(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr4 = V_AND_B32_e32 1023, $vgpr31, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc @@ -67,7 +67,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: ; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr25, implicit $exec @@ -85,7 +85,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 @@ -117,7 +117,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.6.Flow20: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr21 = COPY renamable $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $vgpr20 = COPY $sgpr17, implicit $exec @@ -130,9 +130,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.7.Flow19: ; GFX90A-NEXT: successors: %bb.62(0x40000000), %bb.8(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.62, implicit $exec ; GFX90A-NEXT: {{ $}} @@ -151,28 +151,28 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.10.Flow33: ; GFX90A-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc - ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr58_sgpr59, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.12, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.11.bb84: ; GFX90A-NEXT: successors: %bb.12(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.12.Flow34: ; GFX90A-NEXT: successors: %bb.13(0x40000000), %bb.14(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec @@ -181,15 +181,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.13.bb79: ; GFX90A-NEXT: successors: %bb.14(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.14.Flow35: ; GFX90A-NEXT: successors: %bb.15(0x40000000), %bb.16(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec @@ -198,7 +198,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.15.bb72: ; GFX90A-NEXT: successors: %bb.16(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr8 = S_ADD_U32 renamable $sgpr8, 48, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr9 = S_ADDC_U32 killed renamable $sgpr9, 0, implicit-def dead $scc, implicit killed $scc @@ -208,11 +208,11 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: $sgpr13 = COPY killed renamable $sgpr15 ; GFX90A-NEXT: $sgpr14 = COPY killed renamable $sgpr16 ; GFX90A-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr18_sgpr19, @f2, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.16.Flow36: ; GFX90A-NEXT: successors: %bb.17(0x40000000), %bb.18(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr50_sgpr51, implicit-def $exec, implicit-def $scc, implicit $exec @@ -221,15 +221,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.17.bb67: ; GFX90A-NEXT: successors: %bb.18(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.18.Flow37: ; GFX90A-NEXT: successors: %bb.19(0x40000000), %bb.20(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr48_sgpr49, implicit-def $exec, implicit-def $scc, implicit $exec @@ -238,15 +238,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.19.bb62: ; GFX90A-NEXT: successors: %bb.20(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.20.Flow38: ; GFX90A-NEXT: successors: %bb.21(0x40000000), %bb.22(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec @@ -255,15 +255,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.21.bb54: ; GFX90A-NEXT: successors: %bb.22(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.22.Flow39: ; GFX90A-NEXT: successors: %bb.23(0x40000000), %bb.24(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec @@ -272,15 +272,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.23.bb47: ; GFX90A-NEXT: successors: %bb.24(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.24.Flow40: ; GFX90A-NEXT: successors: %bb.25(0x40000000), %bb.26(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec @@ -289,15 +289,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.25.bb40: ; GFX90A-NEXT: successors: %bb.26(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.26.Flow41: ; GFX90A-NEXT: successors: %bb.27(0x40000000), %bb.28(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec @@ -306,15 +306,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.27.bb33: ; GFX90A-NEXT: successors: %bb.28(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.28.Flow42: ; GFX90A-NEXT: successors: %bb.34(0x40000000), %bb.29(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr38_sgpr39, implicit-def $exec, implicit-def $scc, implicit $exec @@ -323,7 +323,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.29.Flow43: ; GFX90A-NEXT: successors: %bb.30(0x40000000), %bb.31(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc @@ -331,17 +331,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.30.bb19: ; GFX90A-NEXT: successors: %bb.31(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.31.Flow44: ; GFX90A-NEXT: successors: %bb.32(0x40000000), %bb.33(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr58_sgpr59, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr56_sgpr57, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr58_sgpr59, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.33, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.32.UnifiedUnreachableBlock: @@ -357,22 +357,22 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.34.bb26: ; GFX90A-NEXT: successors: %bb.29(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.29 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.35.bb20: ; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1) ; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 @@ -406,14 +406,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.36.Flow21: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.6 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.37.bb27: ; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1) ; GFX90A-NEXT: renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec @@ -443,7 +443,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.38.Flow22: ; GFX90A-NEXT: successors: %bb.36(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -455,16 +455,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_ANDN2_B64 killed renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr58_sgpr59, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr56_sgpr57, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.36 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.39.bb34: ; GFX90A-NEXT: successors: %bb.41(0x40000000), %bb.40(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1) ; GFX90A-NEXT: renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec @@ -493,7 +493,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.40.Flow23: ; GFX90A-NEXT: successors: %bb.38(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -504,16 +504,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr58_sgpr59, killed renamable $sgpr60_sgpr61, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr56_sgpr57, killed renamable $sgpr60_sgpr61, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.38 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.41.bb41: ; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.42(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc @@ -545,7 +545,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.42.Flow24: ; GFX90A-NEXT: successors: %bb.40(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr20, implicit $exec @@ -556,16 +556,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr58_sgpr59, killed renamable $sgpr60_sgpr61, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr56_sgpr57, killed renamable $sgpr60_sgpr61, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.40 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.43.bb55: ; GFX90A-NEXT: successors: %bb.48(0x40000000), %bb.44(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr33, 16, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc @@ -579,7 +579,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.44: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr58, $vgpr57, $vgpr20, $vgpr61, $vgpr31, $vgpr63, $agpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $vgpr40, $vgpr62, $vgpr60, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr56, $vgpr47, $vgpr2, $vgpr3, $vgpr4, $vgpr46, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr14 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr58, $vgpr57, $vgpr20, $vgpr61, $vgpr31, $vgpr63, $agpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $vgpr40, $vgpr62, $vgpr60, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr56, $vgpr47, $vgpr2, $vgpr3, $vgpr4, $vgpr46, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr14 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF @@ -599,14 +599,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.45.Flow26: ; GFX90A-NEXT: successors: %bb.47(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc @@ -615,7 +615,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.46.bb48: ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.47(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc @@ -648,7 +648,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.47.Flow25: ; GFX90A-NEXT: successors: %bb.42(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -657,30 +657,30 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr58_sgpr59, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr56_sgpr57, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.42 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.48.bb63: ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.49: ; GFX90A-NEXT: successors: %bb.44(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1 ; GFX90A-NEXT: S_BRANCH %bb.44 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.50.bb68: ; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 3, $vgpr4_vgpr5, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr48_sgpr49, implicit-def dead $scc @@ -688,7 +688,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.51: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 @@ -708,7 +708,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.52.bb80: ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc @@ -737,7 +737,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.54.bb73: ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr5 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1) ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec @@ -762,14 +762,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.55.Flow29: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr60_sgpr61, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.56.bb90: ; GFX90A-NEXT: successors: %bb.60(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr54 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec ; GFX90A-NEXT: renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec @@ -778,8 +778,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr58, implicit $exec - ; GFX90A-NEXT: renamable $vgpr13 = V_ALIGNBIT_B32_e64 killed $sgpr59, killed $vgpr5, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr56, implicit $exec + ; GFX90A-NEXT: renamable $vgpr13 = V_ALIGNBIT_B32_e64 killed $sgpr57, killed $vgpr5, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr30 = V_ALIGNBIT_B32_e64 $vgpr19, $vgpr18, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr19 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec @@ -793,7 +793,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr23, implicit $exec ; GFX90A-NEXT: renamable $vgpr19 = COPY killed renamable $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 @@ -826,7 +826,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.58.bb105: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) @@ -845,7 +845,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.59.bb85: ; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 1, $vgpr8, implicit $exec ; GFX90A-NEXT: renamable $vgpr11 = COPY renamable $vgpr9, implicit $exec @@ -879,16 +879,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr50_sgpr51, killed renamable $sgpr58_sgpr59, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr50_sgpr51, killed renamable $sgpr56_sgpr57, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.62.bb140: ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.63(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -896,14 +896,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.63.Flow13: ; GFX90A-NEXT: successors: %bb.64(0x40000000), %bb.66(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.66, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.64.bb159: ; GFX90A-NEXT: successors: %bb.67(0x40000000), %bb.65(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr4, implicit $exec ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec @@ -912,21 +912,21 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.65.Flow10: ; GFX90A-NEXT: successors: %bb.66(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.66.Flow14: ; GFX90A-NEXT: successors: %bb.8(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = COPY $exec ; GFX90A-NEXT: S_BRANCH %bb.8 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.67.bb161: ; GFX90A-NEXT: successors: %bb.65(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr23, killed $vgpr25, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr27, implicit $exec @@ -946,7 +946,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.68.bb174: ; GFX90A-NEXT: successors: %bb.72(0x40000000), %bb.69(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr28 = V_OR_B32_e32 1, $vgpr26, implicit $exec ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr28, $vgpr24, implicit $exec @@ -963,14 +963,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.69.Flow: ; GFX90A-NEXT: successors: %bb.70(0x40000000), %bb.71(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.71, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.70.bb186: ; GFX90A-NEXT: successors: %bb.71(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr27, implicit $exec @@ -999,14 +999,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.71.Flow9: ; GFX90A-NEXT: successors: %bb.63(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.63 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.72.bb196: ; GFX90A-NEXT: successors: %bb.69(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr5 = V_OR_B32_e32 $vgpr52, killed $vgpr18, implicit $exec ; GFX90A-NEXT: renamable $vgpr12 = V_OR_B32_e32 killed $vgpr5, killed $vgpr16, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll index 5cc88343faffa..e4c7df385d861 100644 --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -189,29 +189,29 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v8i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s12, 0xff00ff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v0, s3, s3, 8 -; SI-NEXT: v_alignbit_b32 v1, s3, s3, 24 -; SI-NEXT: v_alignbit_b32 v2, s2, s2, 8 -; SI-NEXT: v_alignbit_b32 v4, s2, s2, 24 -; SI-NEXT: v_alignbit_b32 v5, s1, s1, 8 -; SI-NEXT: v_alignbit_b32 v6, s1, s1, 24 -; SI-NEXT: v_alignbit_b32 v7, s0, s0, 8 -; SI-NEXT: v_alignbit_b32 v8, s0, s0, 24 -; SI-NEXT: v_alignbit_b32 v9, s7, s7, 8 -; SI-NEXT: v_alignbit_b32 v10, s7, s7, 24 -; SI-NEXT: v_alignbit_b32 v11, s6, s6, 8 -; SI-NEXT: v_alignbit_b32 v12, s6, s6, 24 -; SI-NEXT: v_alignbit_b32 v13, s5, s5, 8 -; SI-NEXT: v_alignbit_b32 v14, s5, s5, 24 -; SI-NEXT: v_alignbit_b32 v15, s4, s4, 8 -; SI-NEXT: v_alignbit_b32 v16, s4, s4, 24 +; SI-NEXT: v_alignbit_b32 v0, s7, s7, 8 +; SI-NEXT: v_alignbit_b32 v1, s7, s7, 24 +; SI-NEXT: v_alignbit_b32 v2, s6, s6, 8 +; SI-NEXT: v_alignbit_b32 v4, s6, s6, 24 +; SI-NEXT: v_alignbit_b32 v5, s5, s5, 8 +; SI-NEXT: v_alignbit_b32 v6, s5, s5, 24 +; SI-NEXT: v_alignbit_b32 v7, s4, s4, 8 +; SI-NEXT: v_alignbit_b32 v8, s4, s4, 24 +; SI-NEXT: v_alignbit_b32 v9, s11, s11, 8 +; SI-NEXT: v_alignbit_b32 v10, s11, s11, 24 +; SI-NEXT: v_alignbit_b32 v11, s10, s10, 8 +; SI-NEXT: v_alignbit_b32 v12, s10, s10, 24 +; SI-NEXT: v_alignbit_b32 v13, s9, s9, 8 +; SI-NEXT: v_alignbit_b32 v14, s9, s9, 24 +; SI-NEXT: v_alignbit_b32 v15, s8, s8, 8 +; SI-NEXT: v_alignbit_b32 v16, s8, s8, 24 ; SI-NEXT: v_bfi_b32 v3, s12, v1, v0 ; SI-NEXT: v_bfi_b32 v2, s12, v4, v2 ; SI-NEXT: v_bfi_b32 v1, s12, v6, v5 @@ -220,8 +220,8 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace ; SI-NEXT: v_bfi_b32 v6, s12, v12, v11 ; SI-NEXT: v_bfi_b32 v5, s12, v14, v13 ; SI-NEXT: v_bfi_b32 v4, s12, v16, v15 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_bswap_v8i32: @@ -398,29 +398,29 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s12, 0xff00ff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v0, s2, s2, 8 -; SI-NEXT: v_alignbit_b32 v1, s2, s2, 24 -; SI-NEXT: v_alignbit_b32 v2, s3, s3, 8 -; SI-NEXT: v_alignbit_b32 v4, s3, s3, 24 -; SI-NEXT: v_alignbit_b32 v5, s0, s0, 8 -; SI-NEXT: v_alignbit_b32 v6, s0, s0, 24 -; SI-NEXT: v_alignbit_b32 v7, s1, s1, 8 -; SI-NEXT: v_alignbit_b32 v8, s1, s1, 24 -; SI-NEXT: v_alignbit_b32 v9, s6, s6, 8 -; SI-NEXT: v_alignbit_b32 v10, s6, s6, 24 -; SI-NEXT: v_alignbit_b32 v11, s7, s7, 8 -; SI-NEXT: v_alignbit_b32 v12, s7, s7, 24 -; SI-NEXT: v_alignbit_b32 v13, s4, s4, 8 -; SI-NEXT: v_alignbit_b32 v14, s4, s4, 24 -; SI-NEXT: v_alignbit_b32 v15, s5, s5, 8 -; SI-NEXT: v_alignbit_b32 v16, s5, s5, 24 +; SI-NEXT: v_alignbit_b32 v0, s6, s6, 8 +; SI-NEXT: v_alignbit_b32 v1, s6, s6, 24 +; SI-NEXT: v_alignbit_b32 v2, s7, s7, 8 +; SI-NEXT: v_alignbit_b32 v4, s7, s7, 24 +; SI-NEXT: v_alignbit_b32 v5, s4, s4, 8 +; SI-NEXT: v_alignbit_b32 v6, s4, s4, 24 +; SI-NEXT: v_alignbit_b32 v7, s5, s5, 8 +; SI-NEXT: v_alignbit_b32 v8, s5, s5, 24 +; SI-NEXT: v_alignbit_b32 v9, s10, s10, 8 +; SI-NEXT: v_alignbit_b32 v10, s10, s10, 24 +; SI-NEXT: v_alignbit_b32 v11, s11, s11, 8 +; SI-NEXT: v_alignbit_b32 v12, s11, s11, 24 +; SI-NEXT: v_alignbit_b32 v13, s8, s8, 8 +; SI-NEXT: v_alignbit_b32 v14, s8, s8, 24 +; SI-NEXT: v_alignbit_b32 v15, s9, s9, 8 +; SI-NEXT: v_alignbit_b32 v16, s9, s9, 24 ; SI-NEXT: v_bfi_b32 v3, s12, v1, v0 ; SI-NEXT: v_bfi_b32 v2, s12, v4, v2 ; SI-NEXT: v_bfi_b32 v1, s12, v6, v5 @@ -429,8 +429,8 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace ; SI-NEXT: v_bfi_b32 v6, s12, v12, v11 ; SI-NEXT: v_bfi_b32 v5, s12, v14, v13 ; SI-NEXT: v_bfi_b32 v4, s12, v16, v15 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_bswap_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll index 7b7a67193d226..66ba818b400b6 100644 --- a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll +++ b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll @@ -10,7 +10,7 @@ define void @f(i32 %arg, ptr %ptr) { ; ISA-NEXT: s_mov_b64 s[4:5], 0 ; ISA-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v0 ; ISA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; ISA-NEXT: v_mov_b32_e32 v7, 0 +; ISA-NEXT: v_mov_b32_e32 v6, 0 ; ISA-NEXT: s_waitcnt lgkmcnt(0) ; ISA-NEXT: s_lshr_b32 s6, s5, 1 ; ISA-NEXT: s_lshr_b32 s7, 1, s4 @@ -27,18 +27,18 @@ define void @f(i32 %arg, ptr %ptr) { ; ISA-NEXT: s_mov_b32 s4, 0 ; ISA-NEXT: .LBB0_1: ; %bb14 ; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 -; ISA-NEXT: v_mov_b32_e32 v6, v7 +; ISA-NEXT: v_mov_b32_e32 v7, v6 ; ISA-NEXT: s_and_b32 s5, exec_lo, vcc_lo ; ISA-NEXT: s_or_b32 s4, s5, s4 -; ISA-NEXT: v_add_f32_e32 v7, v6, v0 -; ISA-NEXT: v_add_f32_e64 v7, v7, |v3| -; ISA-NEXT: v_add_f32_e32 v7, v7, v4 -; ISA-NEXT: v_add_f32_e32 v7, v7, v5 +; ISA-NEXT: v_add_f32_e32 v6, v7, v0 +; ISA-NEXT: v_add_f32_e64 v6, v6, |v3| +; ISA-NEXT: v_add_f32_e32 v6, v6, v4 +; ISA-NEXT: v_add_f32_e32 v6, v6, v5 ; ISA-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; ISA-NEXT: s_cbranch_execnz .LBB0_1 ; ISA-NEXT: ; %bb.2: ; %bb21 ; ISA-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; ISA-NEXT: flat_store_dword v[1:2], v6 +; ISA-NEXT: flat_store_dword v[1:2], v7 ; ISA-NEXT: s_waitcnt lgkmcnt(0) ; ISA-NEXT: s_setpc_b64 s[30:31] ; MIR-LABEL: name: f diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll index 2338add43d06c..2184478635e0e 100644 --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -709,118 +709,118 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v9, vcc -; GFX9-NEXT: v_xor_b32_e32 v10, v3, v9 -; GFX9-NEXT: v_xor_b32_e32 v11, v2, v9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v11 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v10 -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, 0, v11 -; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v10, vcc -; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, v8, v6 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0 -; GFX9-NEXT: v_mul_lo_u32 v5, v7, v12 -; GFX9-NEXT: v_mul_hi_u32 v13, v6, v2 -; GFX9-NEXT: v_add3_u32 v5, v3, v5, v4 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 -; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v14, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v7, v12 -; GFX9-NEXT: v_mul_lo_u32 v5, v8, v13 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v13, 0 -; GFX9-NEXT: v_add3_u32 v5, v3, v4, v5 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0 -; GFX9-NEXT: v_mul_hi_u32 v14, v13, v2 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v5 +; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v11 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v3, v11, vcc +; GFX9-NEXT: v_xor_b32_e32 v2, v2, v11 +; GFX9-NEXT: v_xor_b32_e32 v3, v4, v11 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v2 +; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, 0, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v2, vcc +; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v5 +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v8 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v8, 0 +; GFX9-NEXT: v_mul_lo_u32 v7, v9, v12 +; GFX9-NEXT: v_mul_hi_u32 v13, v8, v4 +; GFX9-NEXT: v_add3_u32 v7, v5, v7, v6 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v5 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v4, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v13, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v14, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v8, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v9, v12 +; GFX9-NEXT: v_mul_lo_u32 v7, v10, v13 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v13, 0 +; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v7, 0 +; GFX9-NEXT: v_mul_hi_u32 v14, v13, v4 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v4, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v14, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v12, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v13, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v12, v5, vcc ; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v7 -; GFX9-NEXT: v_xor_b32_e32 v5, v0, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v7, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0 -; GFX9-NEXT: v_mul_hi_u32 v6, v5, v2 -; GFX9-NEXT: v_xor_b32_e32 v4, v4, v7 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v10, v2 -; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v2, 0 -; GFX9-NEXT: v_add3_u32 v1, v1, v8, v6 -; GFX9-NEXT: v_sub_u32_e32 v6, v4, v1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v5, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[4:5], v6, v10, vcc -; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v0, v11 -; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[6:7], 0, v6, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[6:7] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v11 +; GFX9-NEXT: v_xor_b32_e32 v8, v0, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v1, v7, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0 +; GFX9-NEXT: v_mul_hi_u32 v9, v8, v4 +; GFX9-NEXT: v_xor_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v4, 0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v9, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v10, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: v_mul_lo_u32 v9, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v10, v3, v5 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v4, 0 +; GFX9-NEXT: v_add3_u32 v1, v1, v10, v9 +; GFX9-NEXT: v_sub_u32_e32 v9, v6, v1 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v8, v0 +; GFX9-NEXT: v_subb_co_u32_e64 v8, s[4:5], v9, v2, vcc +; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v0, v3 +; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[6:7], 0, v8, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[6:7] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v12, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[6:7] -; GFX9-NEXT: v_add_co_u32_e64 v13, s[6:7], 2, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[6:7], 0, v3, s[6:7] -; GFX9-NEXT: v_add_co_u32_e64 v15, s[6:7], 1, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v16, s[6:7], 0, v3, s[6:7] -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v16, v14, s[6:7] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[6:7] +; GFX9-NEXT: v_add_co_u32_e64 v13, s[6:7], 2, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v14, s[6:7], 0, v5, s[6:7] +; GFX9-NEXT: v_add_co_u32_e64 v15, s[6:7], 1, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v6, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v16, s[6:7], 0, v5, s[6:7] +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v16, v14, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v15, v13, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v5, v7, v9 -; GFX9-NEXT: v_xor_b32_e32 v2, v2, v5 -; GFX9-NEXT: v_xor_b32_e32 v3, v3, v5 -; GFX9-NEXT: v_sub_co_u32_e64 v4, s[8:9], v2, v5 -; GFX9-NEXT: v_subb_co_u32_e64 v2, s[4:5], v6, v10, s[4:5] -; GFX9-NEXT: v_subb_co_u32_e64 v5, s[8:9], v3, v5, s[8:9] -; GFX9-NEXT: v_sub_co_u32_e64 v3, s[4:5], v8, v11 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_subb_co_u32_e64 v2, s[4:5], v8, v2, s[4:5] +; GFX9-NEXT: v_sub_co_u32_e64 v3, s[4:5], v9, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc ; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[4:5], 0, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v12, v2, s[6:7] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v15, v13, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_xor_b32_e32 v6, v7, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v3, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, v3, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; GFX9-NEXT: v_xor_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_xor_b32_e32 v5, v5, v6 +; GFX9-NEXT: v_sub_co_u32_e64 v4, s[8:9], v4, v6 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v7 +; GFX9-NEXT: v_subb_co_u32_e64 v5, s[8:9], v5, v6, s[8:9] ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v7 ; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v0, v7 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v1, v7, vcc diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll index 3145c1c3e868b..91fab927be3af 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -164,32 +164,32 @@ define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspa ; SI-LABEL: test_copy_v4i8_x4: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s22, s10 -; SI-NEXT: s_mov_b32 s23, s11 -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s16, s4 -; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s20, s6 -; SI-NEXT: s_mov_b32 s21, s7 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_mov_b32 s22, s2 +; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s16, s8 +; SI-NEXT: s_mov_b32 s17, s9 +; SI-NEXT: s_mov_b32 s20, s10 +; SI-NEXT: s_mov_b32 s21, s11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index 0acd24dc5e3a0..26ccede123601 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -520,18 +520,18 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v8i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -557,7 +557,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr ; SI-NEXT: v_or_b32_e32 v2, v6, v2 ; SI-NEXT: v_or_b32_e32 v1, v5, v1 ; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 1adf93c8e17a5..855b5fff11fe5 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -876,16 +876,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 @@ -916,7 +916,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v1 ; SI-NEXT: v_min3_u32 v0, v0, v1, 64 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_zero_undef_i64_with_select: diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll index dbe2bba62bc9c..4202edfbd0eb4 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -1214,7 +1214,7 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s36, s38 ; SI-NEXT: s_mov_b32 s37, s38 -; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1222,7 +1222,7 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1251,18 +1251,18 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 ; SI-NEXT: v_or_b32_e32 v3, v11, v2 -; SI-NEXT: v_or_b32_e32 v9, v9, v12 +; SI-NEXT: v_or_b32_e32 v8, v8, v12 ; SI-NEXT: v_or_b32_e32 v2, v10, v13 -; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB7_3 ; SI-NEXT: s_branch .LBB7_4 ; SI-NEXT: .LBB7_2: -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr7 @@ -1271,7 +1271,7 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s36, s38 ; SI-NEXT: s_mov_b32 s37, s38 -; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1279,7 +1279,7 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1308,13 +1308,13 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; SI-NEXT: v_or_b32_e32 v3, v3, v0 -; SI-NEXT: v_or_b32_e32 v9, v9, v1 +; SI-NEXT: v_or_b32_e32 v8, v8, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v10 -; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: .LBB7_4: ; %exit -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 @@ -1499,13 +1499,13 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1528,30 +1528,30 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_or_b32_e32 v6, v10, v12 -; SI-NEXT: v_or_b32_e32 v9, v9, v13 -; SI-NEXT: v_or_b32_e32 v8, v8, v14 -; SI-NEXT: v_or_b32_e32 v10, v5, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v9, v10, v12 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_or_b32_e32 v10, v7, v14 +; SI-NEXT: v_or_b32_e32 v11, v5, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v11 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB8_3 ; SI-NEXT: s_branch .LBB8_4 ; SI-NEXT: .LBB8_2: ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -1562,9 +1562,9 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: s_mov_b32 s37, s38 ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1595,25 +1595,25 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v0, v9, v0 ; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v8, v6, v10 +; SI-NEXT: v_or_b32_e32 v8, v7, v10 ; SI-NEXT: v_or_b32_e32 v9, v5, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: .LBB8_4: ; %exit ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -1621,9 +1621,9 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: v_mov_b32_e32 v9, 0x3f200000 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 @@ -1631,11 +1631,11 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 ; SI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v7 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v6 ; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v4 ; SI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v6 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v7 ; SI-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v10 ; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll index 5694a9dc1ffd4..8acc38eaf0170 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -230,6 +230,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) ; FLAT_SCR_OPT-NEXT: v_readlane_b32 s0, v1, 0 ; FLAT_SCR_OPT-NEXT: v_readlane_b32 s1, v1, 1 +; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 +; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v2, 0 ; FLAT_SCR_OPT-NEXT: ; kill: killed $vgpr1 ; FLAT_SCR_OPT-NEXT: global_store_dword v2, v0, s[0:1] @@ -351,6 +353,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) ; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s0, v1, 0 ; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s1, v1, 1 +; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 ; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v2, 0 ; FLAT_SCR_ARCH-NEXT: ; kill: killed $vgpr1 ; FLAT_SCR_ARCH-NEXT: global_store_dword v2, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll index e32d5d773058a..76c40f5962c58 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -294,16 +294,16 @@ entry: define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %x) { ; SI-LABEL: fp_to_sint_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s8, 0x2f800000 ; SI-NEXT: s_mov_b32 s9, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_trunc_f32_e32 v0, s3 -; SI-NEXT: v_trunc_f32_e32 v1, s2 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_trunc_f32_e32 v0, s7 +; SI-NEXT: v_trunc_f32_e32 v1, s6 ; SI-NEXT: v_mul_f32_e64 v2, |v0|, s8 ; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; SI-NEXT: v_mul_f32_e64 v4, |v1|, s8 @@ -324,7 +324,7 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % ; SI-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc ; SI-NEXT: v_sub_i32_e32 v0, vcc, v1, v5 ; SI-NEXT: v_subb_u32_e32 v1, vcc, v6, v5, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_sint_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll index 5703f0771e96d..13e588dffaf5c 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll @@ -4316,20 +4316,20 @@ entry: define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float> %in) { ; SDAG-IEEE-LABEL: elim_redun_check_v2: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SDAG-IEEE-NEXT: s_mov_b32 s11, 0xf000 -; SDAG-IEEE-NEXT: s_mov_b32 s10, -1 +; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; SDAG-IEEE-NEXT: s_mov_b32 s6, -1 ; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, s7, v1 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v3, s7 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, s11, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v3, v2 -; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 -; SDAG-IEEE-NEXT: s_mov_b32 s8, s4 -; SDAG-IEEE-NEXT: s_mov_b32 s9, s5 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s10, v1 +; SDAG-IEEE-NEXT: s_mov_b32 s4, s8 +; SDAG-IEEE-NEXT: s_mov_b32 s5, s9 ; SDAG-IEEE-NEXT: v_add_i32_e32 v4, vcc, -1, v3 ; SDAG-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v2 ; SDAG-IEEE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 @@ -4340,8 +4340,8 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float ; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; SDAG-IEEE-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; SDAG-IEEE-NEXT: v_mov_b32_e32 v5, s6 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v5, s10 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v5, v1, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v5, v0 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v4, 0x260 @@ -4359,7 +4359,7 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4 ; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-IEEE-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SDAG-IEEE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SDAG-IEEE-NEXT: s_endpgm ; ; GISEL-IEEE-LABEL: elim_redun_check_v2: @@ -4522,20 +4522,20 @@ entry: define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x float> %in) { ; SDAG-IEEE-LABEL: elim_redun_check_v2_ult: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SDAG-IEEE-NEXT: s_mov_b32 s11, 0xf000 -; SDAG-IEEE-NEXT: s_mov_b32 s10, -1 +; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; SDAG-IEEE-NEXT: s_mov_b32 s6, -1 ; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, s7, v1 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v3, s7 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, s11, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v3, v2 -; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 -; SDAG-IEEE-NEXT: s_mov_b32 s8, s4 -; SDAG-IEEE-NEXT: s_mov_b32 s9, s5 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s10, v1 +; SDAG-IEEE-NEXT: s_mov_b32 s4, s8 +; SDAG-IEEE-NEXT: s_mov_b32 s5, s9 ; SDAG-IEEE-NEXT: v_add_i32_e32 v4, vcc, -1, v3 ; SDAG-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v2 ; SDAG-IEEE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 @@ -4546,8 +4546,8 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f ; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; SDAG-IEEE-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; SDAG-IEEE-NEXT: v_mov_b32_e32 v5, s6 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v5, s10 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v5, v1, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v5, v0 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v4, 0x260 @@ -4565,7 +4565,7 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4 ; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-IEEE-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SDAG-IEEE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SDAG-IEEE-NEXT: s_endpgm ; ; GISEL-IEEE-LABEL: elim_redun_check_v2_ult: diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index 780d6a8680ff8..8dd73c5ab32fb 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -2565,26 +2565,26 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX10-NEXT: s_clause 0x14 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 -; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 -; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:140 -; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:152 -; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:156 -; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:160 -; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:136 +; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:140 +; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 +; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:148 +; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:152 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:156 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 ; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:116 -; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 ; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:120 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:116 @@ -2636,33 +2636,33 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(32) -; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:284 -; GFX10-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:280 -; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:276 -; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:272 -; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:268 -; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:264 -; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:260 -; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:256 +; GFX10-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen offset:284 +; GFX10-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen offset:280 +; GFX10-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:276 +; GFX10-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:272 +; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:268 +; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:264 +; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:260 +; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:256 ; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:252 ; GFX10-NEXT: s_waitcnt vmcnt(24) -; GFX10-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen offset:248 -; GFX10-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:244 -; GFX10-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:240 +; GFX10-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen offset:248 +; GFX10-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen offset:244 +; GFX10-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen offset:240 ; GFX10-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:236 -; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:232 -; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:228 -; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:224 -; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:220 +; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:232 +; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:228 +; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:224 +; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:220 ; GFX10-NEXT: s_waitcnt vmcnt(16) ; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:216 ; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:212 ; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:208 ; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:204 -; GFX10-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen offset:200 -; GFX10-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen offset:196 -; GFX10-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen offset:192 -; GFX10-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen offset:188 +; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:200 +; GFX10-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:196 +; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:192 +; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:188 ; GFX10-NEXT: s_waitcnt vmcnt(8) ; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:184 ; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:180 @@ -2697,7 +2697,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-LABEL: return_72xi32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0xe +; GFX11-NEXT: s_clause 0x10 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:220 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:216 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:212 @@ -2705,47 +2705,59 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:204 ; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:200 ; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:168 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:164 +; GFX11-NEXT: scratch_store_b128 off, v[29:32], s32 offset:224 +; GFX11-NEXT: scratch_store_b128 off, v[25:28], s32 offset:240 ; GFX11-NEXT: s_clause 0x12 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:160 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:60 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_clause 0x4 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:104 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:72 +; GFX11-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v26, v23 +; GFX11-NEXT: v_dual_mov_b32 v25, v22 :: v_dual_mov_b32 v24, v21 ; GFX11-NEXT: s_add_i32 s1, s0, 0x110 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v2, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v1, off, s32 offset:88 +; GFX11-NEXT: v_dual_mov_b32 v23, v20 :: v_dual_mov_b32 v22, v19 +; GFX11-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v20, v17 +; GFX11-NEXT: v_dual_mov_b32 v19, v16 :: v_dual_mov_b32 v18, v15 +; GFX11-NEXT: v_dual_mov_b32 v17, v14 :: v_dual_mov_b32 v16, v13 +; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v14, v11 +; GFX11-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v12, v9 +; GFX11-NEXT: v_dual_mov_b32 v11, v8 :: v_dual_mov_b32 v10, v7 +; GFX11-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v8, v5 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v7, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v6, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v5, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:104 ; GFX11-NEXT: s_add_i32 s2, s0, 0xe0 ; GFX11-NEXT: s_add_i32 s3, s0, 0xd0 ; GFX11-NEXT: s_add_i32 s34, s0, 0xc0 @@ -2760,61 +2772,59 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: s_add_i32 s43, s0, 48 ; GFX11-NEXT: s_add_i32 s44, s0, 32 ; GFX11-NEXT: s_waitcnt vmcnt(23) -; GFX11-NEXT: scratch_store_b128 off, v[32:35], s1 +; GFX11-NEXT: scratch_store_b128 off, v[45:48], s1 ; GFX11-NEXT: s_add_i32 s1, s0, 0x100 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:108 ; GFX11-NEXT: s_waitcnt vmcnt(21) -; GFX11-NEXT: scratch_store_b128 off, v[36:39], s1 -; GFX11-NEXT: s_clause 0xb -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:124 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 +; GFX11-NEXT: scratch_store_b128 off, v[56:59], s1 +; GFX11-NEXT: s_clause 0xc +; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v2, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v1, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v4, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b128 v[28:31], off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-NEXT: s_add_i32 s1, s0, 0xf0 ; GFX11-NEXT: s_add_i32 s0, s0, 16 -; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: scratch_store_b128 off, v[33:36], s1 -; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: scratch_store_b128 off, v[60:63], s2 -; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: scratch_store_b128 off, v[0:3], s3 -; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: scratch_store_b128 off, v[56:59], s34 -; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: scratch_store_b128 off, v[43:46], s35 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: scratch_store_b128 off, v[39:42], s36 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: scratch_store_b128 off, v[52:55], s37 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: scratch_store_b128 off, v[48:51], s38 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[29:32], s39 -; GFX11-NEXT: scratch_store_b128 off, v[25:28], s40 -; GFX11-NEXT: scratch_store_b128 off, v[21:24], s41 -; GFX11-NEXT: scratch_store_b128 off, v[17:20], s42 -; GFX11-NEXT: scratch_store_b128 off, v[13:16], s43 -; GFX11-NEXT: scratch_store_b128 off, v[9:12], s44 -; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s32 offset:224 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 +; GFX11-NEXT: scratch_store_b128 off, v[59:62], s2 +; GFX11-NEXT: scratch_store_b128 off, v[4:7], s3 +; GFX11-NEXT: scratch_store_b128 off, v[41:44], s34 +; GFX11-NEXT: scratch_store_b128 off, v[37:40], s35 +; GFX11-NEXT: scratch_store_b128 off, v[52:55], s36 +; GFX11-NEXT: scratch_store_b128 off, v[48:51], s37 +; GFX11-NEXT: scratch_store_b128 off, v[33:36], s38 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, s32 offset:224 ; 16-byte Folded Reload +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s39 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, s32 offset:240 ; 16-byte Folded Reload +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s40 +; GFX11-NEXT: scratch_store_b128 off, v[24:27], s41 +; GFX11-NEXT: scratch_store_b128 off, v[20:23], s42 +; GFX11-NEXT: scratch_store_b128 off, v[16:19], s43 +; GFX11-NEXT: scratch_store_b128 off, v[12:15], s44 +; GFX11-NEXT: scratch_store_b128 off, v[8:11], s0 ; GFX11-NEXT: s_clause 0xe -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:192 ; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:196 ; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:200 ; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:204 @@ -2977,14 +2987,29 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:788 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:792 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:796 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:516 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:520 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:524 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:528 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:532 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:536 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:540 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:516 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:524 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:528 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:532 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:536 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:540 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:544 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:548 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:552 ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:556 @@ -2999,30 +3024,15 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:592 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:596 ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:600 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:604 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:608 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:612 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:616 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:620 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:624 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:628 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:604 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:608 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:612 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:616 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:620 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:624 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:628 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:632 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 @@ -3065,21 +3075,13 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 -; GFX9-NEXT: v_mov_b32_e32 v2, v24 -; GFX9-NEXT: v_mov_b32_e32 v3, v25 -; GFX9-NEXT: v_mov_b32_e32 v4, v26 -; GFX9-NEXT: v_mov_b32_e32 v5, v27 -; GFX9-NEXT: v_mov_b32_e32 v6, v28 -; GFX9-NEXT: v_mov_b32_e32 v7, v29 -; GFX9-NEXT: v_mov_b32_e32 v8, v30 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:1568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:1560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:1556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:1552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:1548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:1544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:1540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:1552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:1556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:1560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: v_add_u32_e32 v0, 0x400, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 @@ -3396,8 +3398,8 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_add_i32 s33, s32, 0x1ff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:1536 ; 4-byte Folded Spill +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v32, s33 offset:1536 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 @@ -3407,7 +3409,8 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_addk_i32 s32, 0xa00 -; GFX11-NEXT: s_clause 0xe +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:60 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:56 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:52 ; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:48 @@ -3447,7 +3450,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_load_b64 s[46:47], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s3, s32, 16 ; GFX11-NEXT: s_add_i32 s0, s33, 0x200 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v32, s30, 0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s3 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0 @@ -3466,112 +3469,109 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v26, 0 ; GFX11-NEXT: v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v28, 0 ; GFX11-NEXT: v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v32, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[46:47] -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b128 v[45:48], off, s33 offset:624 -; GFX11-NEXT: scratch_load_b128 v[33:36], off, s33 offset:640 -; GFX11-NEXT: s_add_i32 s0, s32, 0xa0 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_mov_b32_e32 v32, v48 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0xb +; GFX11-NEXT: scratch_load_b128 v[33:36], off, s33 offset:624 +; GFX11-NEXT: scratch_load_b128 v[26:29], off, s33 offset:640 ; GFX11-NEXT: scratch_load_b128 v[48:51], off, s33 offset:656 ; GFX11-NEXT: scratch_load_b128 v[52:55], off, s33 offset:672 -; GFX11-NEXT: scratch_load_b128 v[41:44], off, s33 offset:688 -; GFX11-NEXT: scratch_load_b128 v[56:59], off, s33 offset:704 -; GFX11-NEXT: scratch_load_b128 v[60:63], off, s33 offset:720 -; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:736 +; GFX11-NEXT: scratch_load_b128 v[40:43], off, s33 offset:688 +; GFX11-NEXT: scratch_load_b128 v[44:47], off, s33 offset:704 +; GFX11-NEXT: scratch_load_b128 v[56:59], off, s33 offset:720 +; GFX11-NEXT: scratch_load_b128 v[60:63], off, s33 offset:736 ; GFX11-NEXT: scratch_load_b128 v[0:3], off, s33 offset:752 ; GFX11-NEXT: scratch_load_b128 v[4:7], off, s33 offset:768 ; GFX11-NEXT: scratch_load_b128 v[8:11], off, s33 offset:784 ; GFX11-NEXT: scratch_load_b128 v[12:15], off, s33 offset:512 -; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: v_dual_mov_b32 v38, v53 :: v_dual_mov_b32 v37, v52 +; GFX11-NEXT: s_add_i32 s0, s32, 0xa0 +; GFX11-NEXT: s_waitcnt vmcnt(9) +; GFX11-NEXT: v_dual_mov_b32 v31, v50 :: v_dual_mov_b32 v30, v49 ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: v_dual_mov_b32 v39, v54 :: v_dual_mov_b32 v52, v44 -; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: v_dual_mov_b32 v53, v56 :: v_dual_mov_b32 v54, v57 -; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: v_dual_mov_b32 v44, v62 :: v_dual_mov_b32 v57, v16 +; GFX11-NEXT: v_dual_mov_b32 v49, v40 :: v_dual_mov_b32 v50, v41 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: v_dual_mov_b32 v41, v56 :: v_dual_mov_b32 v40, v47 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_dual_mov_b32 v47, v2 :: v_dual_mov_b32 v2, v5 +; GFX11-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v27 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_store_b128 off, v[12:15], s33 offset:1588 ; 16-byte Folded Spill ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b128 v[12:15], off, s33 offset:528 -; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:544 -; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:560 -; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:576 -; GFX11-NEXT: v_mov_b32_e32 v56, v63 -; GFX11-NEXT: v_mov_b32_e32 v16, v19 -; GFX11-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v2 -; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 -; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 +; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:544 +; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:560 +; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:576 +; GFX11-NEXT: v_dual_mov_b32 v39, v28 :: v_dual_mov_b32 v28, v29 +; GFX11-NEXT: v_dual_mov_b32 v29, v48 :: v_dual_mov_b32 v48, v55 +; GFX11-NEXT: v_dual_mov_b32 v55, v46 :: v_dual_mov_b32 v46, v1 +; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, v7 +; GFX11-NEXT: v_mov_b32_e32 v5, v8 +; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v56, v59 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v8, v15 +; GFX11-NEXT: v_mov_b32_e32 v8, v15 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_dual_mov_b32 v10, v21 :: v_dual_mov_b32 v15, v26 +; GFX11-NEXT: v_dual_mov_b32 v10, v17 :: v_dual_mov_b32 v15, v22 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1572 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:592 +; GFX11-NEXT: scratch_store_b128 off, v[24:27], s33 offset:1572 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:592 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1556 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:608 +; GFX11-NEXT: scratch_store_b128 off, v[24:27], s33 offset:1556 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:608 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1540 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_store_b128 off, v[32:35], s32 -; GFX11-NEXT: v_mov_b32_e32 v32, v36 -; GFX11-NEXT: v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49 -; GFX11-NEXT: v_dual_mov_b32 v35, v50 :: v_dual_mov_b32 v36, v51 -; GFX11-NEXT: v_dual_mov_b32 v48, v55 :: v_dual_mov_b32 v49, v41 -; GFX11-NEXT: v_mov_b32_e32 v50, v42 -; GFX11-NEXT: v_dual_mov_b32 v55, v58 :: v_dual_mov_b32 v58, v17 -; GFX11-NEXT: v_dual_mov_b32 v17, v0 :: v_dual_mov_b32 v0, v3 -; GFX11-NEXT: v_dual_mov_b32 v3, v6 :: v_dual_mov_b32 v6, v9 +; GFX11-NEXT: scratch_store_b128 off, v[24:27], s33 offset:1540 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[36:39], s32 +; GFX11-NEXT: v_dual_mov_b32 v37, v52 :: v_dual_mov_b32 v38, v53 +; GFX11-NEXT: v_mov_b32_e32 v39, v54 +; GFX11-NEXT: v_dual_mov_b32 v53, v44 :: v_dual_mov_b32 v54, v45 +; GFX11-NEXT: v_dual_mov_b32 v44, v63 :: v_dual_mov_b32 v45, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v3, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v9 ; GFX11-NEXT: scratch_store_b32 off, v11, s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x90 -; GFX11-NEXT: v_mov_b32_e32 v51, v43 -; GFX11-NEXT: v_mov_b32_e32 v41, v59 +; GFX11-NEXT: v_dual_mov_b32 v36, v51 :: v_dual_mov_b32 v51, v42 +; GFX11-NEXT: v_mov_b32_e32 v52, v43 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0 -; GFX11-NEXT: v_mov_b32_e32 v7, v14 ; GFX11-NEXT: s_add_i32 s0, s32, 0x80 -; GFX11-NEXT: v_dual_mov_b32 v42, v60 :: v_dual_mov_b32 v43, v61 +; GFX11-NEXT: v_mov_b32_e32 v42, v57 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 -; GFX11-NEXT: v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v9, v20 +; GFX11-NEXT: v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v5, v12 ; GFX11-NEXT: s_add_i32 s0, s32, 0x70 -; GFX11-NEXT: v_mov_b32_e32 v5, v12 -; GFX11-NEXT: scratch_store_b128 off, v[16:19], s0 +; GFX11-NEXT: v_mov_b32_e32 v43, v58 +; GFX11-NEXT: v_dual_mov_b32 v57, v60 :: v_dual_mov_b32 v58, v61 +; GFX11-NEXT: scratch_store_b128 off, v[44:47], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x6c -; GFX11-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v11, v22 +; GFX11-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v7, v14 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x60 -; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45 +; GFX11-NEXT: v_mov_b32_e32 v9, v16 ; GFX11-NEXT: scratch_store_b96 off, v[56:58], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x50 -; GFX11-NEXT: v_mov_b32_e32 v13, v24 -; GFX11-NEXT: scratch_store_b128 off, v[41:44], s0 +; GFX11-NEXT: v_mov_b32_e32 v11, v18 +; GFX11-NEXT: scratch_store_b128 off, v[40:43], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 -; GFX11-NEXT: v_dual_mov_b32 v14, v25 :: v_dual_mov_b32 v31, v47 +; GFX11-NEXT: v_dual_mov_b32 v12, v19 :: v_dual_mov_b32 v13, v20 ; GFX11-NEXT: scratch_store_b128 off, v[52:55], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 48 -; GFX11-NEXT: v_mov_b32_e32 v16, v27 +; GFX11-NEXT: v_mov_b32_e32 v14, v21 ; GFX11-NEXT: scratch_store_b128 off, v[48:51], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 32 -; GFX11-NEXT: v_mov_b32_e32 v30, v46 +; GFX11-NEXT: v_mov_b32_e32 v16, v23 ; GFX11-NEXT: scratch_store_b128 off, v[36:39], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 16 -; GFX11-NEXT: scratch_store_b128 off, v[32:35], s0 -; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, 42 -; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s0 +; GFX11-NEXT: v_mov_b32_e32 v29, v33 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 ; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1572 ; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1556 ; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1540 ; GFX11-NEXT: s_add_i32 s0, s33, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v30, v34 :: v_dual_mov_b32 v31, v35 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 42 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[46:47] -; GFX11-NEXT: s_clause 0xe +; GFX11-NEXT: s_clause 0xf ; GFX11-NEXT: scratch_load_b32 v63, off, s33 ; GFX11-NEXT: scratch_load_b32 v62, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v61, off, s33 offset:8 @@ -3587,10 +3587,11 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:48 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:52 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:56 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:1536 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:60 +; GFX11-NEXT: v_readlane_b32 s31, v32, 1 +; GFX11-NEXT: v_readlane_b32 s30, v32, 0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v32, off, s33 offset:1536 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xf600 ; GFX11-NEXT: s_mov_b32 s33, s45 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index 3344b537ce28f..a6d56df00c862 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -4706,27 +4706,27 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 ; VI-NEXT: s_load_dword s5, s[0:1], 0x10 -; VI-NEXT: s_add_u32 s6, s0, 16 -; VI-NEXT: s_addc_u32 s7, s1, 0 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: .LBB73_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_max_i32_e32 v0, s4, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB73_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: flat_store_dword v[1:2], v0 @@ -5845,27 +5845,27 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 ; VI-NEXT: s_load_dword s5, s[0:1], 0x10 -; VI-NEXT: s_add_u32 s6, s0, 16 -; VI-NEXT: s_addc_u32 s7, s1, 0 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: .LBB85_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_max_u32_e32 v0, s4, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB85_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: flat_store_dword v[1:2], v0 @@ -7593,27 +7593,27 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 ; VI-NEXT: s_load_dword s5, s[0:1], 0x10 -; VI-NEXT: s_add_u32 s6, s0, 16 -; VI-NEXT: s_addc_u32 s7, s1, 0 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: .LBB104_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_min_i32_e32 v0, s4, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB104_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: flat_store_dword v[1:2], v0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index e3e28e9486aa4..005cfe73671bd 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -5053,9 +5053,9 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 -; VI-NEXT: s_add_u32 s6, s0, 32 -; VI-NEXT: s_addc_u32 s7, s1, 0 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 @@ -5066,8 +5066,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5075,11 +5075,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB73_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -6375,9 +6375,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 -; VI-NEXT: s_add_u32 s6, s0, 32 -; VI-NEXT: s_addc_u32 s7, s1, 0 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 @@ -6388,8 +6388,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6397,11 +6397,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB85_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -8416,9 +8416,9 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 -; VI-NEXT: s_add_u32 s6, s0, 32 -; VI-NEXT: s_addc_u32 s7, s1, 0 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 @@ -8429,8 +8429,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8438,11 +8438,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB104_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 4371eb6c3ee92..e2d55990473c0 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -1856,92 +1856,92 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; CI-NEXT: s_add_u32 s2, s2, 16 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: v_mov_b32_e32 v4, s2 -; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: s_add_u32 s2, s0, 48 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_mov_b32_e32 v14, s3 +; CI-NEXT: v_mov_b32_e32 v13, s2 ; CI-NEXT: s_add_u32 s2, s0, 32 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v17, s3 -; CI-NEXT: v_mov_b32_e32 v16, s2 +; CI-NEXT: v_mov_b32_e32 v16, s3 +; CI-NEXT: v_mov_b32_e32 v15, s2 ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v19, s3 -; CI-NEXT: v_mov_b32_e32 v18, s2 +; CI-NEXT: v_mov_b32_e32 v18, s3 +; CI-NEXT: v_mov_b32_e32 v17, s2 ; CI-NEXT: s_add_u32 s2, s0, 0x70 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v13, s1 -; CI-NEXT: v_mov_b32_e32 v12, s0 +; CI-NEXT: v_mov_b32_e32 v12, s1 +; CI-NEXT: v_mov_b32_e32 v11, s0 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 +; CI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; CI-NEXT: flat_store_dwordx4 v[13:14], v[7:10] ; CI-NEXT: s_nop 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; CI-NEXT: v_mov_b32_e32 v14, s3 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; CI-NEXT: v_mov_b32_e32 v13, s2 +; CI-NEXT: s_add_u32 s2, s0, 0x60 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] +; CI-NEXT: v_mov_b32_e32 v16, s3 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; CI-NEXT: v_mov_b32_e32 v15, s2 +; CI-NEXT: s_add_u32 s2, s0, 0x50 +; CI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] +; CI-NEXT: v_cvt_f32_f16_e32 v17, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; CI-NEXT: v_mov_b32_e32 v14, s2 -; CI-NEXT: s_add_u32 s2, s0, 0x60 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; CI-NEXT: v_mov_b32_e32 v17, s3 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: flat_store_dwordx4 v[11:12], v[0:3] +; CI-NEXT: v_cvt_f32_f16_e32 v12, v18 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v5 -; CI-NEXT: v_mov_b32_e32 v16, s2 -; CI-NEXT: s_add_u32 s2, s0, 0x50 ; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; CI-NEXT: s_add_u32 s0, s0, 64 -; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[13:14], v[0:3] ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v19 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v17 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v21 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 -; CI-NEXT: v_mov_b32_e32 v21, s3 +; CI-NEXT: v_mov_b32_e32 v20, s3 ; CI-NEXT: v_mov_b32_e32 v13, s1 -; CI-NEXT: v_mov_b32_e32 v20, s2 +; CI-NEXT: v_mov_b32_e32 v19, s2 ; CI-NEXT: v_mov_b32_e32 v12, s0 -; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; CI-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[15:16], v[8:11] +; CI-NEXT: flat_store_dwordx4 v[19:20], v[0:3] ; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; CI-NEXT: s_endpgm ; @@ -1951,12 +1951,12 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v9, s3 @@ -1984,41 +1984,41 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: s_add_u32 s0, s0, 0x60 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; VI-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v24, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v20, v2 -; VI-NEXT: v_cvt_f32_f16_sdwa v21, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; VI-NEXT: v_cvt_f32_f16_e32 v22, v4 +; VI-NEXT: v_cvt_f32_f16_sdwa v23, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; VI-NEXT: v_cvt_f32_f16_sdwa v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v24, v5 +; VI-NEXT: v_cvt_f32_f16_sdwa v25, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v20, v6 +; VI-NEXT: v_cvt_f32_f16_sdwa v21, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; VI-NEXT: v_cvt_f32_f16_sdwa v32, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v26, v6 -; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; VI-NEXT: v_cvt_f32_f16_sdwa v27, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v26, v2 +; VI-NEXT: v_cvt_f32_f16_sdwa v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; VI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v20 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v21 -; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; VI-NEXT: v_cvt_f32_f16_sdwa v28, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v29, v4 -; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; VI-NEXT: v_cvt_f32_f16_sdwa v30, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v24 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v25 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v22 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v23 +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v31 -; VI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] -; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v32 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v24 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v25 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v28 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v29 +; VI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] +; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v32 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v30 -; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v28 ; VI-NEXT: v_mov_b32_e32 v21, s3 ; VI-NEXT: v_mov_b32_e32 v23, s1 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v26 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index fcbb351277707..8c53d2671de3f 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2712,12 +2712,12 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, 12 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 20, v1 @@ -2732,12 +2732,12 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1 -; GFX9-NEXT: v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshlrev_b16_sdwa v16, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v2 -; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v9 +; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 12, v9 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 @@ -2751,8 +2751,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 -; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 12, v10 +; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v10 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 @@ -2760,38 +2760,38 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v13 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-NEXT: v_ashrrev_i16_e32 v0, 12, v0 ; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-NEXT: v_mul_lo_u16_e32 v13, v16, v18 ; GFX9-NEXT: v_mul_lo_u16_e32 v19, v15, v17 -; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v7, v8, v10 -; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v4, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX9-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX9-NEXT: v_or_b32_sdwa v7, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v1 +; GFX9-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v2, v7, v4 +; GFX9-NEXT: v_add_u16_e32 v3, v7, v3 +; GFX9-NEXT: v_add_u16_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u16_e32 v2, v2, v6 ; GFX9-NEXT: v_add_u16_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v6 -; GFX9-NEXT: v_add_u16_e32 v0, v1, v0 -; GFX9-NEXT: v_mad_legacy_u16 v0, v16, v18, v0 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v5 -; GFX9-NEXT: v_mad_legacy_u16 v0, v15, v17, v0 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v8 -; GFX9-NEXT: global_store_byte v3, v0, s[2:3] +; GFX9-NEXT: v_mad_legacy_u16 v1, v16, v18, v1 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v5 +; GFX9-NEXT: v_mad_legacy_u16 v1, v15, v17, v1 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v8 +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8_vecMul: @@ -2804,12 +2804,12 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3] -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 12 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1 @@ -2824,12 +2824,12 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v16, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v9 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v4, 12, v9 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v15 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 @@ -2843,8 +2843,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v18, 12, v0 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v0, 12, v10 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v18, 12, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v10 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 @@ -2852,38 +2852,38 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v13 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v0, 12, v0 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v13, v16, v18 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v19, v15, v17 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v8, v10 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_sdwa v5, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_sdwa v5, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v14 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX9-DL-NEXT: v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX9-DL-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-DL-NEXT: v_or_b32_sdwa v2, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX9-DL-NEXT: v_or_b32_sdwa v7, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX9-DL-NEXT: v_or_b32_e32 v4, v4, v1 +; GFX9-DL-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v2, v7, v4 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v7, v3 +; GFX9-DL-NEXT: v_add_u16_e32 v2, v3, v2 +; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v6 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v2, v1 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v16, v18, v0 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v5 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v15, v17, v0 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8 -; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3] +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v16, v18, v1 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v5 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v15, v17, v1 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul: diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index 1eebc8e7953e3..b2ba065a079f9 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -1555,15 +1555,15 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s63, 31 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s6, v1 -; GCN-NEXT: v_readfirstlane_b32 s7, v2 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] -; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: v_readfirstlane_b32 s8, v1 +; GCN-NEXT: v_readfirstlane_b32 s9, v2 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: v_mov_b32_e32 v3, v0 ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] +; GCN-NEXT: s_xor_b64 exec, exec, s[6:7] ; GCN-NEXT: s_cbranch_execnz .LBB8_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index 84a0cc6c9220a..f5d41b246b1b8 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -109,7 +109,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX11-NEXT: ; %bb.5: ; %bb18.preheader ; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_hi_u32 s2, s29, s28 ; GFX11-NEXT: s_mul_i32 s3, s29, s28 @@ -127,26 +127,26 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s24, s2 ; GFX11-NEXT: s_lshl_b64 s[20:21], s[2:3], 1 -; GFX11-NEXT: global_load_u16 v2, v1, s[20:21] +; GFX11-NEXT: global_load_u16 v1, v2, s[20:21] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB2_6: ; %bb18 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_cmp_ne_u16_e64 s2, s3, 0 -; GFX11-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: s_and_b32 vcc_lo, s8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v2, v3, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, v3, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo ; GFX11-NEXT: s_mov_b32 vcc_lo, 0 ; GFX11-NEXT: v_readfirstlane_b32 s2, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX11-NEXT: s_bitcmp1_b32 s2, 0 ; GFX11-NEXT: s_cselect_b32 s2, 0x100, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 4fa5b6cf843c1..f98b41ba199bd 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -2798,80 +2798,80 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:16 ; GFX9-NEXT: s_cmp_eq_u32 s7, 6 ; GFX9-NEXT: v_mov_b32_e32 v9, s6 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 7 ; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 5 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 2 -; GFX9-NEXT: v_perm_b32 v3, v3, v10, s2 +; GFX9-NEXT: v_perm_b32 v4, v4, v10, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 3 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 0 -; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2 +; GFX9-NEXT: v_perm_b32 v3, v10, v3, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 14 -; GFX9-NEXT: v_perm_b32 v1, v10, v1, s2 +; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 15 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 12 -; GFX9-NEXT: v_perm_b32 v0, v10, v0, s2 +; GFX9-NEXT: v_perm_b32 v1, v10, v1, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 13 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 10 -; GFX9-NEXT: v_perm_b32 v7, v10, v7, s2 +; GFX9-NEXT: v_perm_b32 v8, v10, v8, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 11 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 8 -; GFX9-NEXT: v_perm_b32 v6, v10, v6, s2 +; GFX9-NEXT: v_perm_b32 v7, v10, v7, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 9 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc -; GFX9-NEXT: v_perm_b32 v5, v10, v5, s2 -; GFX9-NEXT: v_perm_b32 v4, v9, v4, s2 -; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX9-NEXT: v_perm_b32 v6, v10, v6, s2 +; GFX9-NEXT: v_perm_b32 v5, v9, v5, s2 +; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16 +; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v16f16_dynamic: @@ -2975,20 +2975,20 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 -; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v8 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc -; CI-NEXT: v_add_i32_e32 v0, vcc, 16, v4 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dwordx4 v[7:10], v[2:3] ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; CI-NEXT: v_mov_b32_e32 v9, s1 -; CI-NEXT: v_add_i32_e32 v8, vcc, s0, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; CI-NEXT: s_cmp_eq_u32 s5, 15 -; CI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 14 ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -2996,109 +2996,109 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 12 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 ; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] +; CI-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[0:1] ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 11 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc +; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 10 ; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cndmask_b32_e32 v13, v13, v10, vcc +; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 9 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 8 ; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cndmask_b32_e32 v14, v14, v10, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 7 -; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 6 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cndmask_b32_e32 v15, v15, v10, vcc +; CI-NEXT: v_cndmask_b32_e32 v15, v15, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 5 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; CI-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; CI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 4 -; CI-NEXT: v_or_b32_e32 v3, v3, v11 -; CI-NEXT: v_cndmask_b32_e32 v11, v16, v10, vcc +; CI-NEXT: v_or_b32_e32 v10, v10, v11 +; CI-NEXT: v_cndmask_b32_e32 v11, v16, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; CI-NEXT: v_cndmask_b32_e64 v12, v12, v10, s[2:3] -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; CI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; CI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[2:3] ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_or_b32_e32 v6, v6, v11 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_or_b32_e32 v2, v2, v11 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; CI-NEXT: s_cmp_eq_u32 s5, 3 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_or_b32_e32 v2, v2, v12 +; CI-NEXT: v_or_b32_e32 v9, v9, v12 ; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_or_b32_e32 v0, v0, v12 -; CI-NEXT: v_cndmask_b32_e32 v12, v17, v10, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_or_b32_e32 v7, v7, v12 +; CI-NEXT: v_cndmask_b32_e32 v12, v17, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 1 -; CI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 0 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc +; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; CI-NEXT: v_or_b32_e32 v1, v1, v13 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; CI-NEXT: v_or_b32_e32 v8, v8, v13 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; CI-NEXT: v_or_b32_e32 v5, v5, v10 -; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; CI-NEXT: v_or_b32_e32 v7, v7, v13 -; CI-NEXT: v_or_b32_e32 v4, v4, v10 -; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; CI-NEXT: s_nop 0 -; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v8 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CI-NEXT: v_or_b32_e32 v1, v1, v6 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; CI-NEXT: v_or_b32_e32 v3, v3, v13 +; CI-NEXT: v_or_b32_e32 v0, v0, v6 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: s_nop 0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 16, v4 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; CI-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CI-NEXT: s_endpgm ; ; GFX11-LABEL: v_insertelement_v16f16_dynamic: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 035903b9b068e..288616086eb8e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -863,68 +863,68 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[0:3], v1 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 +; GCN-NEXT: v_add_u32_e32 v0, s0, v1 +; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:112 +; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:96 +; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:80 +; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:64 +; GCN-NEXT: ds_read_b128 a[0:3], v0 +; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:16 +; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:32 +; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: v_add_u32_e32 v1, s1, v1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 -; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192 +; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:112 +; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:96 +; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:80 +; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:64 +; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:48 +; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:32 +; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:16 +; GCN-NEXT: ds_write_b128 v1, a[0:3] +; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:8304 +; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:8288 +; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:8272 +; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:8256 +; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:8240 +; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:8224 +; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:8208 +; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:8192 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:24576 +; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:8288 +; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:8304 +; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:8256 +; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:8272 +; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:8224 +; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:8240 +; GCN-NEXT: ds_write_b128 v1, a[0:3] offset:8192 +; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:8208 +; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:24688 +; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:24672 +; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:24656 +; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:24640 +; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:24624 +; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:24608 +; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:24592 +; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:24576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) @@ -933,47 +933,47 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:16480 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16400 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:49152 +; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:16480 +; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:16496 +; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:16448 +; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:16464 +; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:16416 +; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:16432 +; GCN-NEXT: ds_write_b128 v1, a[0:3] offset:16384 +; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:16400 +; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:49264 +; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:49248 +; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:49232 +; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:49216 +; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:49200 +; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:49184 +; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:49168 +; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:49152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1 +; GCN-NEXT: v_add_u32_e32 v0, 0x6000, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:24672 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:24592 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:57456 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:57440 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:57424 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:57408 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:57344 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:57360 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:57376 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:57392 +; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:24672 +; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:24688 +; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:24640 +; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:24656 +; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:24608 +; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:24624 +; GCN-NEXT: ds_write_b128 v1, a[0:3] offset:24576 +; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:24592 +; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:57456 +; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:57440 +; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:57424 +; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:57408 +; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:57344 +; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:57360 +; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:57376 +; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:57392 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) @@ -982,82 +982,82 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:32864 +; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:32880 +; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:32832 +; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:32848 +; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:32800 +; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:32816 +; GCN-NEXT: ds_write_b128 v1, a[0:3] offset:32768 +; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:32784 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:64 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:16 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:32 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:48 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s0, v1 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:48 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s1, v1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 1 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:64 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:48 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:112 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:96 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:80 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:64 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:48 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:32 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:16 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:8304 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:8288 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:8192 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, s1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 1 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:8272 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:8224 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:24688 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:24672 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:24656 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:24640 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:24624 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:24608 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:24576 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:8288 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:8304 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:8256 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:8272 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:8224 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:8240 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] offset:8192 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:24688 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:24672 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:24656 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:24640 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:24624 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:24608 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:24576 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) @@ -1066,47 +1066,47 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 2 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:16480 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:16496 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:16448 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:16464 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:16416 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:16432 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:16384 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16400 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:49264 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:49248 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:49232 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:49216 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:49200 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:49184 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:49168 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:49152 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:16480 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:16496 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:16448 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:16464 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:16416 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:16432 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] offset:16384 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:16400 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:49264 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:49248 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:49232 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:49216 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:49200 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:49184 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:49168 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:49152 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, 0x6000, v1 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, 0x6000, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 1 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:24672 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:24688 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:24640 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:24656 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:24608 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:24624 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:24576 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:57440 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:57392 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:24672 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:24688 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:24640 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:24656 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:24608 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:24624 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] offset:24576 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:57456 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:57392 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) @@ -1115,14 +1115,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 2 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:32864 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:32880 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:32848 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32800 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:32816 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:32768 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:32864 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:32880 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:32832 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:32848 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:32800 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:32816 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] offset:32768 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:32784 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index fa0c723c64e36..0c49338bfcab9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -707,42 +707,42 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-GISEL-LABEL: s_exp2_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s8 ; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, s9 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; SI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v1 -; SI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_add_f32_e32 v0, s8, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[0:1] ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_add_f32_e32 v1, s5, v1 +; SI-GISEL-NEXT: v_add_f32_e32 v1, s9, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc ; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 1.0, v4, s[0:1] -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] -; SI-GISEL-NEXT: v_add_f32_e32 v5, s6, v5 -; SI-GISEL-NEXT: v_add_f32_e32 v2, s7, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v5, s10, v5 +; SI-GISEL-NEXT: v_add_f32_e32 v2, s11, v2 ; SI-GISEL-NEXT: v_exp_f32_e32 v5, v5 ; SI-GISEL-NEXT: v_exp_f32_e32 v3, v2 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[0:1] ; SI-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 -; SI-GISEL-NEXT: s_mov_b32 s10, -1 -; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 -; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_exp2_v4f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index af12c10fec5d6..d499e017e92f4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -865,42 +865,42 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-GISEL-LABEL: s_log2_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: s_mov_b32 s0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s8 ; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, s9 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc ; SI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s5, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] -; SI-GISEL-NEXT: v_mul_f32_e32 v5, s6, v5 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v5, s10, v5 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2 ; SI-GISEL-NEXT: v_log_f32_e32 v5, v5 ; SI-GISEL-NEXT: v_log_f32_e32 v3, v2 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] ; SI-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 -; SI-GISEL-NEXT: s_mov_b32 s10, -1 -; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 -; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_log2_v4f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 2acd9c0017b09..1a51c8708b941 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -574,86 +574,86 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 ; CI-NEXT: s_brev_b32 s2, -2 -; CI-NEXT: v_mov_b32_e32 v12, 0 +; CI-NEXT: v_mov_b32_e32 v4, 0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[4:5] +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[4:5] ; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] -; CI-NEXT: v_mov_b32_e32 v6, s7 +; CI-NEXT: v_mov_b32_e32 v5, s7 ; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 -; CI-NEXT: v_add_f64 v[2:3], s[4:5], -v[4:5] +; CI-NEXT: v_add_f64 v[2:3], s[4:5], -v[6:7] ; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 -; CI-NEXT: v_mov_b32_e32 v7, s4 -; CI-NEXT: v_bfi_b32 v13, s2, v7, v6 +; CI-NEXT: v_mov_b32_e32 v8, s4 +; CI-NEXT: v_bfi_b32 v5, s2, v8, v5 ; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[10:11] +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[10:11] ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[12:13] -; CI-NEXT: v_mov_b32_e32 v8, s4 -; CI-NEXT: v_mov_b32_e32 v9, s5 -; CI-NEXT: v_add_f64 v[0:1], s[10:11], -v[6:7] -; CI-NEXT: v_bfi_b32 v13, s2, v8, v9 +; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v10, s5 +; CI-NEXT: v_add_f64 v[0:1], s[10:11], -v[8:9] +; CI-NEXT: v_bfi_b32 v5, s2, v5, v10 ; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 -; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[12:13] -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[8:9] +; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] ; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_add_f64 v[8:9], s[8:9], -v[4:5] +; CI-NEXT: v_add_f64 v[10:11], s[8:9], -v[6:7] ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v10, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[8:9]|, 0.5 -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[14:15] -; CI-NEXT: v_mov_b32_e32 v11, s11 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 +; CI-NEXT: v_trunc_f64_e32 v[10:11], s[14:15] +; CI-NEXT: v_mov_b32_e32 v12, s11 ; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_bfi_b32 v13, s2, v10, v11 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v12 ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[10:11], s[14:15], -v[8:9] -; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[12:13] -; CI-NEXT: v_mov_b32_e32 v13, s4 +; CI-NEXT: v_add_f64 v[12:13], s[14:15], -v[10:11] +; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s4 ; CI-NEXT: v_mov_b32_e32 v14, s9 -; CI-NEXT: v_bfi_b32 v13, s2, v13, v14 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v14 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[14:15], s[12:13] ; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_add_f64 v[10:11], s[12:13], -v[14:15] +; CI-NEXT: v_add_f64 v[12:13], s[12:13], -v[14:15] ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[4:5], v[4:5], v[12:13] -; CI-NEXT: v_mov_b32_e32 v13, s4 +; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v16, s15 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 -; CI-NEXT: v_bfi_b32 v13, s2, v13, v16 -; CI-NEXT: v_trunc_f64_e32 v[16:17], s[18:19] +; CI-NEXT: v_bfi_b32 v5, s2, v5, v16 ; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_add_f64 v[10:11], v[8:9], v[12:13] +; CI-NEXT: v_trunc_f64_e32 v[16:17], s[18:19] ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[8:9], s[18:19], -v[16:17] -; CI-NEXT: v_mov_b32_e32 v13, s4 +; CI-NEXT: v_add_f64 v[12:13], v[10:11], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s4 ; CI-NEXT: v_mov_b32_e32 v18, s13 -; CI-NEXT: v_bfi_b32 v13, s2, v13, v18 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[8:9]|, 0.5 -; CI-NEXT: v_trunc_f64_e32 v[18:19], s[16:17] -; CI-NEXT: v_add_f64 v[8:9], v[14:15], v[12:13] +; CI-NEXT: v_add_f64 v[10:11], s[18:19], -v[16:17] +; CI-NEXT: v_bfi_b32 v5, s2, v5, v18 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 +; CI-NEXT: v_add_f64 v[10:11], v[14:15], v[4:5] +; CI-NEXT: v_trunc_f64_e32 v[14:15], s[16:17] ; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_add_f64 v[14:15], s[16:17], -v[18:19] +; CI-NEXT: v_add_f64 v[18:19], s[16:17], -v[14:15] ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v13, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[14:15]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[18:19]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v20, s19 ; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_bfi_b32 v13, s2, v13, v20 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v20 ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[14:15], v[16:17], v[12:13] -; CI-NEXT: v_mov_b32_e32 v13, s4 -; CI-NEXT: v_mov_b32_e32 v16, s17 -; CI-NEXT: v_bfi_b32 v13, s2, v13, v16 -; CI-NEXT: v_add_f64 v[12:13], v[18:19], v[12:13] +; CI-NEXT: v_add_f64 v[16:17], v[16:17], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v18, s17 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v18 +; CI-NEXT: v_add_f64 v[14:15], v[14:15], v[4:5] ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 236faf8ecf2e8..438b1bfe319a0 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -1282,17 +1282,17 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v16i1_to_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s10, s6 -; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 +; GFX6-NEXT: s_mov_b32 s11, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s2 -; GFX6-NEXT: s_mov_b32 s9, s3 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_mov_b32 s9, s7 ; GFX6-NEXT: buffer_load_ushort v12, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_u32 v3, v12, 3, 1 ; GFX6-NEXT: v_bfe_u32 v1, v12, 1, 1 @@ -1310,10 +1310,10 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX6-NEXT: v_bfe_u32 v8, v12, 8, 1 ; GFX6-NEXT: v_bfe_u32 v14, v12, 14, 1 ; GFX6-NEXT: v_bfe_u32 v12, v12, 12, 1 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_zextload_v16i1_to_v16i32: @@ -1442,17 +1442,17 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v16i1_to_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s10, s6 -; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 +; GFX6-NEXT: s_mov_b32 s11, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s2 -; GFX6-NEXT: s_mov_b32 s9, s3 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_mov_b32 s9, s7 ; GFX6-NEXT: buffer_load_ushort v12, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_i32 v3, v12, 3, 1 ; GFX6-NEXT: v_bfe_i32 v2, v12, 2, 1 @@ -1470,10 +1470,10 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX6-NEXT: v_bfe_i32 v14, v12, 14, 1 ; GFX6-NEXT: v_bfe_i32 v13, v12, 13, 1 ; GFX6-NEXT: v_bfe_i32 v12, v12, 12, 1 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_sextload_v16i1_to_v16i32: @@ -2428,12 +2428,12 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s5, s3, 24 -; GFX8-NEXT: s_lshr_b32 s7, s2, 24 +; GFX8-NEXT: s_lshr_b32 s6, s3, 24 +; GFX8-NEXT: s_lshr_b32 s8, s2, 24 ; GFX8-NEXT: s_bfe_u32 s4, s2, 0x10018 -; GFX8-NEXT: s_bfe_u32 s6, s3, 0x10018 -; GFX8-NEXT: s_and_b32 s10, s3, 1 -; GFX8-NEXT: s_and_b32 s11, s2, 1 +; GFX8-NEXT: s_bfe_u32 s5, s3, 0x10018 +; GFX8-NEXT: s_and_b32 s7, s3, 1 +; GFX8-NEXT: s_and_b32 s9, s2, 1 ; GFX8-NEXT: s_bfe_u32 s12, s2, 0x10013 ; GFX8-NEXT: s_bfe_u32 s13, s2, 0x10012 ; GFX8-NEXT: s_bfe_u32 s14, s2, 0x10011 @@ -2446,170 +2446,170 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_u32 s21, s3, 0x10012 ; GFX8-NEXT: s_bfe_u32 s22, s3, 0x10011 ; GFX8-NEXT: s_bfe_u32 s23, s3, 0x10010 -; GFX8-NEXT: s_bfe_u32 s8, s3, 0x10017 -; GFX8-NEXT: s_bfe_u32 s9, s3, 0x10016 +; GFX8-NEXT: s_bfe_u32 s10, s3, 0x10017 +; GFX8-NEXT: s_bfe_u32 s11, s3, 0x10016 ; GFX8-NEXT: s_bfe_u32 s24, s3, 0x10015 ; GFX8-NEXT: s_bfe_u32 s25, s3, 0x10014 -; GFX8-NEXT: v_mov_b32_e32 v25, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 0xd0 -; GFX8-NEXT: v_mov_b32_e32 v24, s9 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v27, s9 -; GFX8-NEXT: v_mov_b32_e32 v26, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 0xc0 +; GFX8-NEXT: v_mov_b32_e32 v25, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 0xd0 +; GFX8-NEXT: v_mov_b32_e32 v24, s11 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v27, s11 +; GFX8-NEXT: v_mov_b32_e32 v26, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 0xc0 ; GFX8-NEXT: v_mov_b32_e32 v22, s25 ; GFX8-NEXT: v_mov_b32_e32 v23, s24 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s9 -; GFX8-NEXT: v_mov_b32_e32 v26, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v27, s11 +; GFX8-NEXT: v_mov_b32_e32 v26, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v22, s23 ; GFX8-NEXT: v_mov_b32_e32 v23, s22 ; GFX8-NEXT: v_mov_b32_e32 v24, s21 ; GFX8-NEXT: v_mov_b32_e32 v25, s20 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s9 -; GFX8-NEXT: v_mov_b32_e32 v26, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 64 +; GFX8-NEXT: v_mov_b32_e32 v27, s11 +; GFX8-NEXT: v_mov_b32_e32 v26, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v22, s19 ; GFX8-NEXT: v_mov_b32_e32 v23, s18 ; GFX8-NEXT: v_mov_b32_e32 v24, s17 ; GFX8-NEXT: v_mov_b32_e32 v25, s16 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s9 -; GFX8-NEXT: v_mov_b32_e32 v26, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 48 +; GFX8-NEXT: v_mov_b32_e32 v27, s11 +; GFX8-NEXT: v_mov_b32_e32 v26, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v22, s15 ; GFX8-NEXT: v_mov_b32_e32 v23, s14 ; GFX8-NEXT: v_mov_b32_e32 v24, s13 ; GFX8-NEXT: v_mov_b32_e32 v25, s12 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s2 ; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 12, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s3 -; GFX8-NEXT: v_mov_b32_e32 v25, s9 +; GFX8-NEXT: v_mov_b32_e32 v25, s11 ; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s2 ; GFX8-NEXT: v_and_b32_e32 v21, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v27, 1, v22 ; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s3 -; GFX8-NEXT: v_mov_b32_e32 v24, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 32 +; GFX8-NEXT: v_mov_b32_e32 v24, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 32 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s2 ; GFX8-NEXT: v_and_b32_e32 v28, 1, v22 ; GFX8-NEXT: v_and_b32_e32 v22, 1, v20 ; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s2 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX8-NEXT: v_and_b32_e32 v20, 1, v18 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, s2 +; GFX8-NEXT: v_and_b32_e32 v20, 1, v19 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s2 ; GFX8-NEXT: v_and_b32_e32 v17, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v19, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v18, 1, v2 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s2 ; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 3, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v19, 3, s3 ; GFX8-NEXT: v_mov_b32_e32 v25, 1 -; GFX8-NEXT: v_mov_b32_e32 v21, s9 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s2 -; GFX8-NEXT: v_and_b32_e32 v23, 1, v18 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v16 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX8-NEXT: v_and_b32_sdwa v16, v11, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v20, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s5 +; GFX8-NEXT: v_mov_b32_e32 v21, s11 ; GFX8-NEXT: v_and_b32_e32 v12, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s2 -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GFX8-NEXT: v_and_b32_e32 v20, 1, v11 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 1, s5 -; GFX8-NEXT: s_add_u32 s8, s0, 16 +; GFX8-NEXT: v_and_b32_e32 v23, 1, v19 +; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX8-NEXT: v_and_b32_e32 v18, 1, v16 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX8-NEXT: v_and_b32_sdwa v16, v14, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v20, s10 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 5, s6 ; GFX8-NEXT: v_and_b32_e32 v15, 1, v0 +; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GFX8-NEXT: v_and_b32_e32 v20, 1, v14 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s6 +; GFX8-NEXT: s_add_u32 s10, s0, 16 +; GFX8-NEXT: v_and_b32_e32 v17, 1, v14 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; GFX8-NEXT: v_lshrrev_b16_e64 v15, 3, s6 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s2 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v11 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 3, s5 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 4, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s2 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v19, 1, v15 +; GFX8-NEXT: v_mov_b32_e32 v16, s11 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s2 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v15 -; GFX8-NEXT: v_and_b32_e32 v21, 1, v11 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v12 -; GFX8-NEXT: v_mov_b32_e32 v12, s9 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v14 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v9 -; GFX8-NEXT: v_mov_b32_e32 v11, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 2, s2 -; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[14:17] -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 5, s7 -; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX8-NEXT: v_mov_b32_e32 v14, s1 -; GFX8-NEXT: s_add_u32 s8, s0, 0xb0 +; GFX8-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX8-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX8-NEXT: v_mov_b32_e32 v15, s10 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s2 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v0 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s3 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v9 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v10 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v7 -; GFX8-NEXT: v_mov_b32_e32 v9, s11 -; GFX8-NEXT: v_mov_b32_e32 v13, s0 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s7 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s8 +; GFX8-NEXT: v_mov_b32_e32 v13, s1 +; GFX8-NEXT: v_and_b32_e32 v15, 1, v11 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v10 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v9 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX8-NEXT: v_mov_b32_e32 v8, s9 +; GFX8-NEXT: v_mov_b32_e32 v12, s0 +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GFX8-NEXT: s_add_u32 s10, s0, 0xb0 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 1, s8 ; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s3 ; GFX8-NEXT: v_and_b32_e32 v6, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 14, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 14, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s3 -; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[9:12] +; GFX8-NEXT: v_and_b32_e32 v11, 1, v8 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 3, s8 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v9, s10 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v7 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 3, s7 -; GFX8-NEXT: v_mov_b32_e32 v10, s9 -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 6, s7 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v7 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v8 +; GFX8-NEXT: v_and_b32_e32 v13, 1, v8 +; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX8-NEXT: v_lshrrev_b16_e64 v8, 15, s3 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX8-NEXT: v_mov_b32_e32 v9, s8 +; GFX8-NEXT: v_mov_b32_e32 v10, s11 ; GFX8-NEXT: v_lshrrev_b16_e64 v3, 10, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v18, 2, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 4, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s8 ; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8] -; GFX8-NEXT: v_and_b32_e32 v10, 1, v17 +; GFX8-NEXT: s_add_u32 s2, s0, 0xa0 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v13 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v17 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 0xa0 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v26, 6, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v22, 2, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 2, s5 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v13 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v16 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v18 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v16 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v14 +; GFX8-NEXT: v_and_b32_e32 v14, 1, v18 ; GFX8-NEXT: v_and_b32_e32 v18, 1, v3 ; GFX8-NEXT: v_and_b32_sdwa v16, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 4, s7 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v19 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v15 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v19 ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v4 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x90 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v15 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v21 ; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19] ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v1 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v20 ; GFX8-NEXT: v_and_b32_e32 v20, 1, v0 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 6, s5 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 6, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x80 ; GFX8-NEXT: v_and_b32_e32 v18, 1, v4 @@ -2621,30 +2621,30 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23] ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v28 ; GFX8-NEXT: v_mov_b32_e32 v21, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s10 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v20, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xf0 -; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s5 +; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s6 ; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[1:4] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 7, s5 +; GFX8-NEXT: v_lshrrev_b16_e64 v19, 7, s6 ; GFX8-NEXT: v_and_b32_e32 v16, 1, v24 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s8 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v6, 1, v12 -; GFX8-NEXT: v_mov_b32_e32 v12, s6 +; GFX8-NEXT: v_mov_b32_e32 v12, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v11 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 7, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 7, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 0x60 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11] @@ -2669,173 +2669,173 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T41.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T39.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T37.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T20.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T21.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T34.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T32.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T30.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T28.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T26.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T24.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T24.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T22.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 22: -; EG-NEXT: VTX_READ_64 T20.XY, T19.X, 0, #1 +; EG-NEXT: VTX_READ_64 T21.XY, T19.X, 0, #1 ; EG-NEXT: ALU clause starting at 24: ; EG-NEXT: MOV * T19.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 25: -; EG-NEXT: BFE_UINT * T19.W, T20.X, literal.x, 1, +; EG-NEXT: BFE_UINT * T19.W, T21.X, literal.x, 1, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T19.Z, T20.X, literal.x, 1, +; EG-NEXT: BFE_UINT * T19.Z, T21.X, literal.x, 1, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T19.Y, T20.X, 1, 1, -; EG-NEXT: BFE_UINT * T21.W, T20.X, literal.x, 1, +; EG-NEXT: BFE_UINT T19.Y, T21.X, 1, 1, +; EG-NEXT: BFE_UINT * T20.W, T21.X, literal.x, 1, ; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) -; EG-NEXT: AND_INT T19.X, T20.X, 1, -; EG-NEXT: BFE_UINT T21.Z, T20.X, literal.x, 1, +; EG-NEXT: AND_INT T19.X, T21.X, 1, +; EG-NEXT: BFE_UINT T20.Z, T21.X, literal.x, 1, ; EG-NEXT: LSHR * T22.X, KC0[2].Y, literal.y, ; EG-NEXT: 6(8.407791e-45), 2(2.802597e-45) -; EG-NEXT: BFE_UINT T21.Y, T20.X, literal.x, 1, -; EG-NEXT: BFE_UINT * T23.W, T20.X, literal.y, 1, +; EG-NEXT: BFE_UINT T20.Y, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT * T23.W, T21.X, literal.y, 1, ; EG-NEXT: 5(7.006492e-45), 11(1.541428e-44) -; EG-NEXT: BFE_UINT T21.X, T20.X, literal.x, 1, -; EG-NEXT: BFE_UINT T23.Z, T20.X, literal.y, 1, +; EG-NEXT: BFE_UINT T20.X, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT T23.Z, T21.X, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 4(5.605194e-45), 10(1.401298e-44) ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T24.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T23.Y, T20.X, literal.y, 1, -; EG-NEXT: BFE_UINT * T25.W, T20.X, literal.z, 1, +; EG-NEXT: BFE_UINT T23.Y, T21.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T25.W, T21.X, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 9(1.261169e-44) ; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T23.X, T20.X, literal.x, 1, -; EG-NEXT: BFE_UINT T25.Z, T20.X, literal.y, 1, +; EG-NEXT: BFE_UINT T23.X, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT T25.Z, T21.X, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 8(1.121039e-44), 14(1.961818e-44) ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T26.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T25.Y, T20.X, literal.y, 1, -; EG-NEXT: BFE_UINT * T27.W, T20.X, literal.z, 1, +; EG-NEXT: BFE_UINT T25.Y, T21.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T27.W, T21.X, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 13(1.821688e-44) ; EG-NEXT: 19(2.662467e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T25.X, T20.X, literal.x, 1, -; EG-NEXT: BFE_UINT T27.Z, T20.X, literal.y, 1, +; EG-NEXT: BFE_UINT T25.X, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT T27.Z, T21.X, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 12(1.681558e-44), 18(2.522337e-44) ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T28.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T27.Y, T20.X, literal.y, 1, -; EG-NEXT: BFE_UINT * T29.W, T20.X, literal.z, 1, +; EG-NEXT: BFE_UINT T27.Y, T21.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T29.W, T21.X, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 17(2.382207e-44) ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T27.X, T20.X, literal.x, 1, -; EG-NEXT: BFE_UINT T29.Z, T20.X, literal.y, 1, +; EG-NEXT: BFE_UINT T27.X, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT T29.Z, T21.X, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 22(3.082857e-44) ; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T30.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T29.Y, T20.X, literal.y, 1, -; EG-NEXT: BFE_UINT * T31.W, T20.X, literal.z, 1, +; EG-NEXT: BFE_UINT T29.Y, T21.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T31.W, T21.X, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 21(2.942727e-44) ; EG-NEXT: 27(3.783506e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T29.X, T20.X, literal.x, 1, -; EG-NEXT: BFE_UINT T31.Z, T20.X, literal.y, 1, +; EG-NEXT: BFE_UINT T29.X, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT T31.Z, T21.X, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 20(2.802597e-44), 26(3.643376e-44) ; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T32.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T31.Y, T20.X, literal.y, 1, -; EG-NEXT: LSHR * T33.W, T20.X, literal.z, +; EG-NEXT: BFE_UINT T31.Y, T21.X, literal.y, 1, +; EG-NEXT: LSHR * T33.W, T21.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 25(3.503246e-44) ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T31.X, T20.X, literal.x, 1, -; EG-NEXT: BFE_UINT T33.Z, T20.X, literal.y, 1, +; EG-NEXT: BFE_UINT T31.X, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT T33.Z, T21.X, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 24(3.363116e-44), 30(4.203895e-44) ; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T34.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T33.Y, T20.X, literal.y, 1, -; EG-NEXT: BFE_UINT * T35.W, T20.Y, literal.z, 1, +; EG-NEXT: BFE_UINT T33.Y, T21.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T35.W, T21.Y, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 29(4.063766e-44) ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T33.X, T20.X, literal.x, 1, -; EG-NEXT: BFE_UINT T35.Z, T20.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T33.X, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT T35.Z, T21.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 28(3.923636e-44), 2(2.802597e-45) ; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) -; EG-NEXT: LSHR T20.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T35.Y, T20.Y, 1, 1, -; EG-NEXT: BFE_UINT T36.W, T20.Y, literal.y, 1, -; EG-NEXT: AND_INT * T35.X, T20.Y, 1, +; EG-NEXT: LSHR T21.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T35.Y, T21.Y, 1, 1, +; EG-NEXT: BFE_UINT T36.W, T21.Y, literal.y, 1, +; EG-NEXT: AND_INT * T35.X, T21.Y, 1, ; EG-NEXT: 2(2.802597e-45), 7(9.809089e-45) -; EG-NEXT: BFE_UINT T36.Z, T20.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T36.Z, T21.Y, literal.x, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 6(8.407791e-45), 128(1.793662e-43) ; EG-NEXT: LSHR T37.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T36.Y, T20.Y, literal.y, 1, -; EG-NEXT: BFE_UINT * T38.W, T20.Y, literal.z, 1, +; EG-NEXT: BFE_UINT T36.Y, T21.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T38.W, T21.Y, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 5(7.006492e-45) ; EG-NEXT: 11(1.541428e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T36.X, T20.Y, literal.x, 1, -; EG-NEXT: BFE_UINT T38.Z, T20.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T36.X, T21.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T38.Z, T21.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 4(5.605194e-45), 10(1.401298e-44) ; EG-NEXT: 144(2.017870e-43), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 122: ; EG-NEXT: LSHR T39.X, T0.W, literal.x, -; EG-NEXT: BFE_UINT T38.Y, T20.Y, literal.y, 1, -; EG-NEXT: BFE_UINT * T40.W, T20.Y, literal.z, 1, +; EG-NEXT: BFE_UINT T38.Y, T21.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T40.W, T21.Y, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 9(1.261169e-44) ; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T38.X, T20.Y, literal.x, 1, -; EG-NEXT: BFE_UINT T40.Z, T20.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T38.X, T21.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T40.Z, T21.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 8(1.121039e-44), 14(1.961818e-44) ; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T41.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T40.Y, T20.Y, literal.y, 1, -; EG-NEXT: BFE_UINT * T42.W, T20.Y, literal.z, 1, +; EG-NEXT: BFE_UINT T40.Y, T21.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T42.W, T21.Y, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 13(1.821688e-44) ; EG-NEXT: 19(2.662467e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T40.X, T20.Y, literal.x, 1, -; EG-NEXT: BFE_UINT T42.Z, T20.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T40.X, T21.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T42.Z, T21.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 12(1.681558e-44), 18(2.522337e-44) ; EG-NEXT: 176(2.466285e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T43.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T42.Y, T20.Y, literal.y, 1, -; EG-NEXT: BFE_UINT * T44.W, T20.Y, literal.z, 1, +; EG-NEXT: BFE_UINT T42.Y, T21.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T44.W, T21.Y, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 17(2.382207e-44) ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T42.X, T20.Y, literal.x, 1, -; EG-NEXT: BFE_UINT T44.Z, T20.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T42.X, T21.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T44.Z, T21.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 22(3.082857e-44) ; EG-NEXT: 192(2.690493e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T45.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T44.Y, T20.Y, literal.y, 1, -; EG-NEXT: BFE_UINT * T46.W, T20.Y, literal.z, 1, +; EG-NEXT: BFE_UINT T44.Y, T21.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T46.W, T21.Y, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 21(2.942727e-44) ; EG-NEXT: 27(3.783506e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T44.X, T20.Y, literal.x, 1, -; EG-NEXT: BFE_UINT T46.Z, T20.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T44.X, T21.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T46.Z, T21.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 20(2.802597e-44), 26(3.643376e-44) ; EG-NEXT: 208(2.914701e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T47.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T46.Y, T20.Y, literal.y, 1, -; EG-NEXT: LSHR * T48.W, T20.Y, literal.z, +; EG-NEXT: BFE_UINT T46.Y, T21.Y, literal.y, 1, +; EG-NEXT: LSHR * T48.W, T21.Y, literal.z, ; EG-NEXT: 2(2.802597e-45), 25(3.503246e-44) ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T46.X, T20.Y, literal.x, 1, -; EG-NEXT: BFE_UINT T48.Z, T20.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T46.X, T21.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T48.Z, T21.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 24(3.363116e-44), 30(4.203895e-44) ; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T49.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT * T48.Y, T20.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T48.Y, T21.Y, literal.y, 1, ; EG-NEXT: 2(2.802597e-45), 29(4.063766e-44) -; EG-NEXT: BFE_UINT T48.X, T20.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T48.X, T21.Y, literal.x, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 28(3.923636e-44), 240(3.363116e-43) ; EG-NEXT: LSHR * T50.X, PV.W, literal.x, @@ -3031,15 +3031,15 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v12, 6, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v13, 7, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 1, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v8, 2, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v9, 3, s2 -; GFX8-NEXT: s_lshr_b32 s6, s3, 24 -; GFX8-NEXT: s_lshr_b32 s7, s2, 24 +; GFX8-NEXT: s_lshr_b32 s7, s3, 24 +; GFX8-NEXT: s_lshr_b32 s8, s2, 24 ; GFX8-NEXT: s_bfe_i32 s4, s2, 0x10018 ; GFX8-NEXT: s_bfe_i32 s5, s3, 0x10018 -; GFX8-NEXT: s_bfe_i32 s10, s3, 0x10000 -; GFX8-NEXT: s_bfe_i32 s11, s2, 0x10000 +; GFX8-NEXT: s_bfe_i32 s6, s3, 0x10000 +; GFX8-NEXT: s_bfe_i32 s9, s2, 0x10000 ; GFX8-NEXT: s_bfe_i32 s12, s2, 0x10013 ; GFX8-NEXT: s_bfe_i32 s13, s2, 0x10012 ; GFX8-NEXT: s_bfe_i32 s14, s2, 0x10011 @@ -3052,59 +3052,59 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i32 s20, s3, 0x10012 ; GFX8-NEXT: s_bfe_i32 s21, s3, 0x10011 ; GFX8-NEXT: s_bfe_i32 s22, s3, 0x10010 -; GFX8-NEXT: s_bfe_i32 s8, s3, 0x10017 -; GFX8-NEXT: s_bfe_i32 s9, s3, 0x10016 +; GFX8-NEXT: s_bfe_i32 s10, s3, 0x10017 +; GFX8-NEXT: s_bfe_i32 s11, s3, 0x10016 ; GFX8-NEXT: s_bfe_i32 s23, s3, 0x10015 ; GFX8-NEXT: s_bfe_i32 s24, s3, 0x10014 -; GFX8-NEXT: v_mov_b32_e32 v25, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 0xd0 -; GFX8-NEXT: v_mov_b32_e32 v24, s9 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v27, s9 -; GFX8-NEXT: v_mov_b32_e32 v26, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 0xc0 +; GFX8-NEXT: v_mov_b32_e32 v25, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 0xd0 +; GFX8-NEXT: v_mov_b32_e32 v24, s11 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v27, s11 +; GFX8-NEXT: v_mov_b32_e32 v26, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 0xc0 ; GFX8-NEXT: v_mov_b32_e32 v22, s24 ; GFX8-NEXT: v_mov_b32_e32 v23, s23 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s9 -; GFX8-NEXT: v_mov_b32_e32 v26, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v27, s11 +; GFX8-NEXT: v_mov_b32_e32 v26, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v22, s22 ; GFX8-NEXT: v_mov_b32_e32 v23, s21 ; GFX8-NEXT: v_mov_b32_e32 v24, s20 ; GFX8-NEXT: v_mov_b32_e32 v25, s19 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s9 -; GFX8-NEXT: v_mov_b32_e32 v26, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 64 +; GFX8-NEXT: v_mov_b32_e32 v27, s11 +; GFX8-NEXT: v_mov_b32_e32 v26, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v22, s2 ; GFX8-NEXT: v_mov_b32_e32 v23, s18 ; GFX8-NEXT: v_mov_b32_e32 v24, s17 ; GFX8-NEXT: v_mov_b32_e32 v25, s16 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s9 -; GFX8-NEXT: v_mov_b32_e32 v26, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 48 +; GFX8-NEXT: v_mov_b32_e32 v27, s11 +; GFX8-NEXT: v_mov_b32_e32 v26, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v22, s15 ; GFX8-NEXT: v_mov_b32_e32 v23, s14 ; GFX8-NEXT: v_mov_b32_e32 v24, s13 ; GFX8-NEXT: v_mov_b32_e32 v25, s12 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] ; GFX8-NEXT: v_bfe_i32 v21, v21, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v23, s9 +; GFX8-NEXT: v_mov_b32_e32 v23, s11 ; GFX8-NEXT: v_bfe_i32 v20, v20, 0, 1 ; GFX8-NEXT: v_bfe_i32 v19, v19, 0, 1 ; GFX8-NEXT: v_bfe_i32 v18, v18, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v22, s8 +; GFX8-NEXT: v_mov_b32_e32 v22, s10 ; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_lshrrev_b16_e64 v3, 12, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v4, 13, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v5, 14, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 15, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v6, 15, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v2, 10, s3 @@ -3135,28 +3135,29 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; GFX8-NEXT: s_add_u32 s2, s0, 0xb0 -; GFX8-NEXT: v_mov_b32_e32 v13, s1 -; GFX8-NEXT: v_bfe_i32 v11, v9, 0, 1 -; GFX8-NEXT: v_bfe_i32 v10, v8, 0, 1 -; GFX8-NEXT: v_bfe_i32 v9, v6, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v8, s11 -; GFX8-NEXT: v_mov_b32_e32 v12, s0 +; GFX8-NEXT: v_mov_b32_e32 v12, s1 +; GFX8-NEXT: v_bfe_i32 v10, v9, 0, 1 +; GFX8-NEXT: v_bfe_i32 v9, v8, 0, 1 +; GFX8-NEXT: v_bfe_i32 v8, v7, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v7, s9 +; GFX8-NEXT: v_mov_b32_e32 v11, s0 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GFX8-NEXT: v_bfe_i32 v6, v7, 0, 1 +; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[7:10] +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 6, s8 ; GFX8-NEXT: v_mov_b32_e32 v8, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 5, s7 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 6, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 5, s8 +; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 1 ; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 1 ; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 1 ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xa0 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 1, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 4, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 1, s8 ; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[3:6] ; GFX8-NEXT: v_bfe_i32 v8, v11, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 2, s7 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 3, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v3, 2, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 3, s8 ; GFX8-NEXT: v_bfe_i32 v7, v10, 0, 1 ; GFX8-NEXT: v_bfe_i32 v11, v1, 0, 1 ; GFX8-NEXT: v_bfe_i32 v10, v0, 0, 1 @@ -3165,17 +3166,16 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1 ; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 1 ; GFX8-NEXT: v_bfe_i32 v3, v12, 0, 1 +; GFX8-NEXT: v_bfe_i32 v6, v13, 0, 1 ; GFX8-NEXT: v_bfe_i32 v13, v24, 0, 1 ; GFX8-NEXT: v_bfe_i32 v12, v2, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x90 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 5, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v25, 4, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v19, 5, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s7 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_bfe_i32 v6, v25, 0, 1 ; GFX8-NEXT: v_bfe_i32 v12, v15, 0, 1 ; GFX8-NEXT: v_bfe_i32 v15, v19, 0, 1 ; GFX8-NEXT: v_bfe_i32 v19, v23, 0, 1 @@ -3185,8 +3185,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_bfe_i32 v22, v26, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x80 -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 4, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v18, 4, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s7 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[22:25] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -3194,11 +3194,11 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_bfe_i32 v14, v18, 0, 1 ; GFX8-NEXT: v_bfe_i32 v21, v21, 0, 1 ; GFX8-NEXT: v_bfe_i32 v20, v20, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v18, s10 +; GFX8-NEXT: v_mov_b32_e32 v18, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0xf0 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 7, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v17, 7, s7 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[18:21] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -3206,7 +3206,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_bfe_i32 v16, v16, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 3, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, 3, s7 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[14:17] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -3214,7 +3214,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v10, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 7, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 7, s8 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -4202,14 +4202,14 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v8i1_to_v8i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s10, s6 -; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 +; GFX6-NEXT: s_mov_b32 s11, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s2 -; GFX6-NEXT: s_mov_b32 s9, s3 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_mov_b32 s9, s7 ; GFX6-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 @@ -4219,8 +4219,8 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX6-NEXT: v_mov_b32_e32 v11, v1 ; GFX6-NEXT: v_mov_b32_e32 v13, v1 ; GFX6-NEXT: v_mov_b32_e32 v15, v1 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_u32 v14, v0, 1, 1 ; GFX6-NEXT: v_bfe_u32 v10, v0, 3, 1 @@ -4230,10 +4230,10 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX6-NEXT: v_bfe_u32 v8, v0, 2, 1 ; GFX6-NEXT: v_bfe_u32 v4, v0, 4, 1 ; GFX6-NEXT: v_bfe_u32 v0, v0, 6, 1 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:48 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:32 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_zextload_v8i1_to_v8i64: @@ -4558,100 +4558,100 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mov_b32_e32 v6, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v23, 0 +; GFX8-NEXT: v_mov_b32_e32 v17, 0 +; GFX8-NEXT: v_mov_b32_e32 v21, 0 +; GFX8-NEXT: v_mov_b32_e32 v19, v17 +; GFX8-NEXT: v_mov_b32_e32 v13, v17 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: s_add_u32 s4, s0, 0x50 ; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, s5 -; GFX8-NEXT: v_mov_b32_e32 v25, s3 -; GFX8-NEXT: v_mov_b32_e32 v8, s4 -; GFX8-NEXT: v_mov_b32_e32 v24, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v24, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v23, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v21, v2 -; GFX8-NEXT: v_mov_b32_e32 v17, v2 -; GFX8-NEXT: v_mov_b32_e32 v13, v2 -; GFX8-NEXT: v_mov_b32_e32 v19, 0 +; GFX8-NEXT: v_mov_b32_e32 v9, v17 +; GFX8-NEXT: v_mov_b32_e32 v5, v17 +; GFX8-NEXT: v_mov_b32_e32 v22, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v7, 0 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 10, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 11, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 14, v0 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[3:6] -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 15, v0 -; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[1:4] -; GFX8-NEXT: v_mov_b32_e32 v25, s3 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 9, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, 1 -; GFX8-NEXT: v_mov_b32_e32 v24, s2 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, 10, v2 +; GFX8-NEXT: v_and_b32_e32 v18, 1, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, 11, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, 14, v2 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[18:21] +; GFX8-NEXT: v_mov_b32_e32 v0, 1 +; GFX8-NEXT: v_and_b32_e32 v16, 1, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v18, 15, v2 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[16:19] +; GFX8-NEXT: v_mov_b32_e32 v24, s3 +; GFX8-NEXT: v_and_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v2 +; GFX8-NEXT: v_mov_b32_e32 v23, s2 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_add_u32 s2, s0, 0x60 -; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX8-NEXT: v_mov_b32_e32 v19, 0 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff, v0 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[1:4] -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 12, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[16:19] +; GFX8-NEXT: v_mov_b32_e32 v24, s3 +; GFX8-NEXT: v_mov_b32_e32 v23, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v25, s3 -; GFX8-NEXT: v_and_b32_e32 v20, 1, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 13, v0 -; GFX8-NEXT: v_mov_b32_e32 v24, s2 +; GFX8-NEXT: v_mov_b32_e32 v26, s3 +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 12, v2 +; GFX8-NEXT: v_mov_b32_e32 v25, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_and_b32_e32 v19, 1, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 13, v2 +; GFX8-NEXT: v_mov_b32_e32 v20, v17 +; GFX8-NEXT: v_mov_b32_e32 v1, v17 +; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v22, 0xffff, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 2, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: flat_store_dwordx4 v[3:4], v[20:23] +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_mov_b32_e32 v16, s0 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v0 ; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 7, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 6, v0 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 4, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v6, 7, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 6, v2 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[19:22] ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v12, 1, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 5, v0 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v10, 3, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_mov_b32_e32 v21, s3 -; GFX8-NEXT: v_mov_b32_e32 v23, s1 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v6 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, 2, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v8, 4, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v10, 5, v2 +; GFX8-NEXT: v_and_b32_e32 v12, 1, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v14, 3, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v22, 1, v6 +; GFX8-NEXT: v_mov_b32_e32 v19, s3 +; GFX8-NEXT: v_mov_b32_e32 v21, s1 ; GFX8-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff, v3 -; GFX8-NEXT: v_mov_b32_e32 v20, s2 -; GFX8-NEXT: v_mov_b32_e32 v22, s0 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v0 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v14 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v22 +; GFX8-NEXT: v_mov_b32_e32 v18, s2 +; GFX8-NEXT: v_mov_b32_e32 v20, s0 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[16:19] -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[1:2], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[12:15] +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v16i1_to_v16i64: @@ -5131,15 +5131,15 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 11, s2 -; GFX8-NEXT: v_and_b32_e32 v15, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v16, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s2 -; GFX8-NEXT: s_lshr_b32 s7, s2, 24 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v0 +; GFX8-NEXT: s_lshr_b32 s14, s2, 24 +; GFX8-NEXT: v_and_b32_e32 v11, 1, v1 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s2 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x10018 -; GFX8-NEXT: s_and_b32 s14, s2, 1 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10018 +; GFX8-NEXT: s_and_b32 s11, s2, 1 ; GFX8-NEXT: s_bfe_u32 s15, s2, 0x10011 ; GFX8-NEXT: s_bfe_u32 s16, s2, 0x10010 ; GFX8-NEXT: s_bfe_u32 s17, s2, 0x10012 @@ -5148,133 +5148,128 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_u32 s20, s2, 0x10015 ; GFX8-NEXT: s_bfe_u32 s21, s2, 0x10016 ; GFX8-NEXT: s_bfe_u32 s22, s2, 0x10017 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 14, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 12, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 14, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v15, 12, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v9, 10, s2 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e64 v5, 6, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 4, s2 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 2, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, 4, s2 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 2, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 15, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v22, 15, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xb0 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: s_add_u32 s4, s0, 0xa0 ; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: s_add_u32 s8, s0, 0x90 +; GFX8-NEXT: s_add_u32 s6, s0, 0x90 +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: s_add_u32 s8, s0, 0x80 ; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: s_add_u32 s10, s0, 0x80 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NEXT: s_add_u32 s12, s0, 0x70 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s7 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s14 ; GFX8-NEXT: s_addc_u32 s13, s1, 0 ; GFX8-NEXT: v_and_b32_e32 v12, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s7 -; GFX8-NEXT: v_mov_b32_e32 v21, s13 -; GFX8-NEXT: v_and_b32_e32 v23, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s14 +; GFX8-NEXT: v_mov_b32_e32 v25, s13 +; GFX8-NEXT: v_and_b32_e32 v18, 1, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v20, s12 +; GFX8-NEXT: v_mov_b32_e32 v24, s12 ; GFX8-NEXT: s_add_u32 s12, s0, 0xf0 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX8-NEXT: v_mov_b32_e32 v17, v1 -; GFX8-NEXT: v_mov_b32_e32 v19, v1 +; GFX8-NEXT: v_and_b32_e32 v20, 1, v14 +; GFX8-NEXT: v_mov_b32_e32 v21, v1 +; GFX8-NEXT: v_mov_b32_e32 v23, v1 ; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GFX8-NEXT: v_mov_b32_e32 v21, s13 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 1, s7 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 6, s7 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX8-NEXT: v_mov_b32_e32 v20, s12 +; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23] +; GFX8-NEXT: v_mov_b32_e32 v25, s13 +; GFX8-NEXT: v_lshrrev_b16_e64 v17, 6, s14 +; GFX8-NEXT: v_mov_b32_e32 v24, s12 ; GFX8-NEXT: s_add_u32 s12, s0, 0x60 -; GFX8-NEXT: v_and_b32_e32 v25, 0xffff, v16 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v22 -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 7, s7 +; GFX8-NEXT: v_and_b32_e32 v20, 1, v17 +; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s14 ; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GFX8-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX8-NEXT: v_mov_b32_e32 v19, s13 -; GFX8-NEXT: v_and_b32_e32 v16, 0xffff, v15 -; GFX8-NEXT: v_mov_b32_e32 v17, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, v1 -; GFX8-NEXT: v_mov_b32_e32 v18, s12 +; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23] +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff, v11 +; GFX8-NEXT: v_and_b32_e32 v20, 1, v15 +; GFX8-NEXT: v_and_b32_e32 v22, 0xffff, v16 +; GFX8-NEXT: v_mov_b32_e32 v16, s13 +; GFX8-NEXT: v_mov_b32_e32 v15, s12 ; GFX8-NEXT: s_add_u32 s12, s0, 0x50 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[14:17] -; GFX8-NEXT: v_and_b32_e32 v18, 1, v9 -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v10 +; GFX8-NEXT: v_mov_b32_e32 v23, 0 ; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, s12 -; GFX8-NEXT: v_mov_b32_e32 v10, s13 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[20:23] +; GFX8-NEXT: v_mov_b32_e32 v16, s13 +; GFX8-NEXT: v_and_b32_e32 v23, 1, v9 +; GFX8-NEXT: v_mov_b32_e32 v24, v1 +; GFX8-NEXT: v_mov_b32_e32 v15, s12 ; GFX8-NEXT: s_add_u32 s12, s0, 64 -; GFX8-NEXT: v_mov_b32_e32 v21, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, v1 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[23:26] +; GFX8-NEXT: v_mov_b32_e32 v15, 1 ; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v16, 0xffff, v23 -; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[18:21] -; GFX8-NEXT: v_mov_b32_e32 v9, 1 -; GFX8-NEXT: v_mov_b32_e32 v23, s13 -; GFX8-NEXT: v_and_b32_sdwa v18, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v6 -; GFX8-NEXT: v_mov_b32_e32 v21, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, v1 -; GFX8-NEXT: v_mov_b32_e32 v22, s12 +; GFX8-NEXT: v_and_b32_sdwa v23, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v16, s13 +; GFX8-NEXT: v_mov_b32_e32 v15, s12 ; GFX8-NEXT: s_add_u32 s12, s0, 48 -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21] +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff, v8 +; GFX8-NEXT: v_mov_b32_e32 v26, 0 ; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v5 -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, s12 -; GFX8-NEXT: v_mov_b32_e32 v21, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s13 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[23:26] +; GFX8-NEXT: v_mov_b32_e32 v16, s13 +; GFX8-NEXT: v_and_b32_e32 v23, 1, v5 +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff, v3 +; GFX8-NEXT: v_mov_b32_e32 v26, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s12 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[18:21] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v3 -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v0 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[23:26] +; GFX8-NEXT: v_mov_b32_e32 v16, s3 +; GFX8-NEXT: v_and_b32_e32 v23, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s21 ; GFX8-NEXT: v_mov_b32_e32 v2, s22 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v15, s2 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v16, s5 ; GFX8-NEXT: v_mov_b32_e32 v0, s19 ; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_mov_b32_e32 v15, s4 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v16, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s17 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: v_mov_b32_e32 v15, s6 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v16, s9 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: v_mov_b32_e32 v2, s15 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_mov_b32_e32 v15, s8 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v21, 0 +; GFX8-NEXT: v_mov_b32_e32 v26, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[18:21] +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[23:26] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX8-NEXT: v_mov_b32_e32 v15, 0 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v8 -; GFX8-NEXT: v_and_b32_e32 v8, 0xffff, v7 -; GFX8-NEXT: v_mov_b32_e32 v9, 0 -; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 4, s14 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v7 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_and_b32_e32 v9, 1, v13 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX8-NEXT: v_mov_b32_e32 v13, 0 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX8-NEXT: v_mov_b32_e32 v0, s11 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[6:9] -; GFX8-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NEXT: v_mov_b32_e32 v2, v11 -; GFX8-NEXT: v_mov_b32_e32 v3, v15 +; GFX8-NEXT: v_mov_b32_e32 v2, v10 +; GFX8-NEXT: v_mov_b32_e32 v3, v13 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -5282,26 +5277,29 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0xd0 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 4, s7 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v13 -; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX8-NEXT: v_mov_b32_e32 v13, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s14 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX8-NEXT: v_mov_b32_e32 v12, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, v1 ; GFX8-NEXT: s_add_u32 s0, s0, 0xc0 -; GFX8-NEXT: v_lshrrev_b16_e64 v24, 2, s7 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[10:13] +; GFX8-NEXT: v_lshrrev_b16_e64 v19, 2, s14 +; GFX8-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[9:12] ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v24 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; GFX8-NEXT: v_mov_b32_e32 v17, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, v1 +; GFX8-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v18 +; GFX8-NEXT: v_mov_b32_e32 v22, 0 +; GFX8-NEXT: v_mov_b32_e32 v20, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[14:17] -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, v25 -; GFX8-NEXT: v_mov_b32_e32 v3, v26 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[19:22] +; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_mov_b32_e32 v2, v14 +; GFX8-NEXT: v_mov_b32_e32 v3, v17 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -5475,98 +5473,98 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s52, s4, 30 -; GFX6-NEXT: s_lshr_b32 s40, s4, 31 -; GFX6-NEXT: s_lshr_b32 s42, s4, 28 -; GFX6-NEXT: s_lshr_b32 s20, s4, 29 -; GFX6-NEXT: s_lshr_b32 s24, s4, 26 -; GFX6-NEXT: s_lshr_b32 s16, s4, 27 -; GFX6-NEXT: s_lshr_b32 s22, s4, 24 -; GFX6-NEXT: s_lshr_b32 s6, s4, 25 -; GFX6-NEXT: s_lshr_b32 s8, s4, 22 -; GFX6-NEXT: s_lshr_b32 s10, s4, 23 -; GFX6-NEXT: s_lshr_b32 s12, s4, 20 -; GFX6-NEXT: s_lshr_b32 s14, s4, 21 -; GFX6-NEXT: s_lshr_b32 s18, s4, 18 -; GFX6-NEXT: s_lshr_b32 s26, s4, 19 -; GFX6-NEXT: s_lshr_b32 s28, s4, 16 -; GFX6-NEXT: s_lshr_b32 s30, s4, 17 -; GFX6-NEXT: s_lshr_b32 s34, s4, 14 -; GFX6-NEXT: s_lshr_b32 s36, s4, 15 -; GFX6-NEXT: s_lshr_b32 s38, s4, 12 -; GFX6-NEXT: s_lshr_b32 s44, s4, 13 -; GFX6-NEXT: s_lshr_b32 s46, s4, 10 -; GFX6-NEXT: s_lshr_b32 s48, s4, 11 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[4:5], 0x10000 +; GFX6-NEXT: s_lshr_b32 s52, s8, 30 +; GFX6-NEXT: s_lshr_b32 s46, s8, 31 +; GFX6-NEXT: s_lshr_b32 s48, s8, 28 +; GFX6-NEXT: s_lshr_b32 s36, s8, 29 +; GFX6-NEXT: s_lshr_b32 s38, s8, 26 +; GFX6-NEXT: s_lshr_b32 s26, s8, 27 +; GFX6-NEXT: s_lshr_b32 s28, s8, 24 +; GFX6-NEXT: s_lshr_b32 s4, s8, 25 +; GFX6-NEXT: s_lshr_b32 s6, s8, 22 +; GFX6-NEXT: s_lshr_b32 s10, s8, 23 +; GFX6-NEXT: s_lshr_b32 s12, s8, 20 +; GFX6-NEXT: s_lshr_b32 s14, s8, 21 +; GFX6-NEXT: s_lshr_b32 s16, s8, 18 +; GFX6-NEXT: s_lshr_b32 s18, s8, 19 +; GFX6-NEXT: s_lshr_b32 s20, s8, 16 +; GFX6-NEXT: s_lshr_b32 s22, s8, 17 +; GFX6-NEXT: s_lshr_b32 s24, s8, 14 +; GFX6-NEXT: s_lshr_b32 s30, s8, 15 +; GFX6-NEXT: s_lshr_b32 s34, s8, 12 +; GFX6-NEXT: s_lshr_b32 s40, s8, 13 +; GFX6-NEXT: s_lshr_b32 s42, s8, 10 +; GFX6-NEXT: s_lshr_b32 s44, s8, 11 +; GFX6-NEXT: s_bfe_i64 s[50:51], s[8:9], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s50 ; GFX6-NEXT: v_mov_b32_e32 v1, s51 -; GFX6-NEXT: s_lshr_b32 s50, s4, 8 +; GFX6-NEXT: s_lshr_b32 s50, s8, 8 ; GFX6-NEXT: v_mov_b32_e32 v2, s52 ; GFX6-NEXT: v_mov_b32_e32 v3, s53 -; GFX6-NEXT: s_lshr_b32 s52, s4, 9 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v4, s40 -; GFX6-NEXT: v_mov_b32_e32 v5, s41 -; GFX6-NEXT: s_lshr_b32 s40, s4, 6 -; GFX6-NEXT: v_mov_b32_e32 v6, s42 -; GFX6-NEXT: v_mov_b32_e32 v7, s43 -; GFX6-NEXT: s_lshr_b32 s42, s4, 7 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v8, s20 -; GFX6-NEXT: v_mov_b32_e32 v9, s21 -; GFX6-NEXT: s_lshr_b32 s20, s4, 4 -; GFX6-NEXT: v_mov_b32_e32 v10, s24 -; GFX6-NEXT: v_mov_b32_e32 v11, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 5 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v12, s16 -; GFX6-NEXT: v_mov_b32_e32 v13, s17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 2 -; GFX6-NEXT: v_mov_b32_e32 v14, s22 -; GFX6-NEXT: v_mov_b32_e32 v15, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 3 -; GFX6-NEXT: s_lshr_b32 s4, s4, 1 -; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX6-NEXT: s_lshr_b32 s52, s8, 9 ; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v4, s46 +; GFX6-NEXT: v_mov_b32_e32 v5, s47 +; GFX6-NEXT: s_lshr_b32 s46, s8, 6 +; GFX6-NEXT: v_mov_b32_e32 v6, s48 +; GFX6-NEXT: v_mov_b32_e32 v7, s49 +; GFX6-NEXT: s_lshr_b32 s48, s8, 7 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s36 +; GFX6-NEXT: v_mov_b32_e32 v9, s37 +; GFX6-NEXT: s_lshr_b32 s36, s8, 4 +; GFX6-NEXT: v_mov_b32_e32 v10, s38 +; GFX6-NEXT: v_mov_b32_e32 v11, s39 +; GFX6-NEXT: s_lshr_b32 s38, s8, 5 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v12, s26 +; GFX6-NEXT: v_mov_b32_e32 v13, s27 +; GFX6-NEXT: s_lshr_b32 s26, s8, 2 +; GFX6-NEXT: v_mov_b32_e32 v14, s28 +; GFX6-NEXT: v_mov_b32_e32 v15, s29 +; GFX6-NEXT: s_lshr_b32 s28, s8, 3 +; GFX6-NEXT: s_lshr_b32 s8, s8, 1 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:240 ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:224 ; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208 -; GFX6-NEXT: v_mov_b32_e32 v16, s6 -; GFX6-NEXT: v_mov_b32_e32 v17, s7 +; GFX6-NEXT: v_mov_b32_e32 v16, s4 +; GFX6-NEXT: v_mov_b32_e32 v17, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(3) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NEXT: v_mov_b32_e32 v4, s10 ; GFX6-NEXT: v_mov_b32_e32 v5, s11 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176 @@ -5577,34 +5575,34 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v5, s15 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: v_mov_b32_e32 v3, s19 -; GFX6-NEXT: v_mov_b32_e32 v4, s26 -; GFX6-NEXT: v_mov_b32_e32 v5, s27 +; GFX6-NEXT: v_mov_b32_e32 v2, s16 +; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: v_mov_b32_e32 v4, s18 +; GFX6-NEXT: v_mov_b32_e32 v5, s19 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s28 -; GFX6-NEXT: v_mov_b32_e32 v3, s29 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NEXT: v_mov_b32_e32 v4, s22 +; GFX6-NEXT: v_mov_b32_e32 v5, s23 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:128 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s24 +; GFX6-NEXT: v_mov_b32_e32 v3, s25 ; GFX6-NEXT: v_mov_b32_e32 v4, s30 ; GFX6-NEXT: v_mov_b32_e32 v5, s31 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:128 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s34 ; GFX6-NEXT: v_mov_b32_e32 v3, s35 -; GFX6-NEXT: v_mov_b32_e32 v4, s36 -; GFX6-NEXT: v_mov_b32_e32 v5, s37 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 +; GFX6-NEXT: v_mov_b32_e32 v4, s40 +; GFX6-NEXT: v_mov_b32_e32 v5, s41 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: v_mov_b32_e32 v3, s43 ; GFX6-NEXT: v_mov_b32_e32 v4, s44 ; GFX6-NEXT: v_mov_b32_e32 v5, s45 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:96 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s46 -; GFX6-NEXT: v_mov_b32_e32 v3, s47 -; GFX6-NEXT: v_mov_b32_e32 v4, s48 -; GFX6-NEXT: v_mov_b32_e32 v5, s49 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s50 @@ -5613,26 +5611,26 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v5, s53 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s40 -; GFX6-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NEXT: v_mov_b32_e32 v4, s42 -; GFX6-NEXT: v_mov_b32_e32 v5, s43 +; GFX6-NEXT: v_mov_b32_e32 v2, s46 +; GFX6-NEXT: v_mov_b32_e32 v3, s47 +; GFX6-NEXT: v_mov_b32_e32 v4, s48 +; GFX6-NEXT: v_mov_b32_e32 v5, s49 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, s21 -; GFX6-NEXT: v_mov_b32_e32 v4, s24 -; GFX6-NEXT: v_mov_b32_e32 v5, s25 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NEXT: v_mov_b32_e32 v4, s38 +; GFX6-NEXT: v_mov_b32_e32 v5, s39 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NEXT: v_mov_b32_e32 v4, s22 -; GFX6-NEXT: v_mov_b32_e32 v5, s23 +; GFX6-NEXT: v_mov_b32_e32 v2, s26 +; GFX6-NEXT: v_mov_b32_e32 v3, s27 +; GFX6-NEXT: v_mov_b32_e32 v4, s28 +; GFX6-NEXT: v_mov_b32_e32 v5, s29 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5640,19 +5638,19 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s10, s4, 22 -; GFX8-NEXT: s_lshr_b32 s12, s4, 23 -; GFX8-NEXT: s_lshr_b32 s14, s4, 20 -; GFX8-NEXT: s_lshr_b32 s16, s4, 21 -; GFX8-NEXT: s_lshr_b32 s18, s4, 18 -; GFX8-NEXT: s_lshr_b32 s20, s4, 19 -; GFX8-NEXT: s_lshr_b32 s22, s4, 16 -; GFX8-NEXT: s_lshr_b32 s24, s4, 17 -; GFX8-NEXT: s_lshr_b32 s6, s4, 24 +; GFX8-NEXT: s_lshr_b32 s10, s8, 22 +; GFX8-NEXT: s_lshr_b32 s12, s8, 23 +; GFX8-NEXT: s_lshr_b32 s14, s8, 20 +; GFX8-NEXT: s_lshr_b32 s16, s8, 21 +; GFX8-NEXT: s_lshr_b32 s18, s8, 18 +; GFX8-NEXT: s_lshr_b32 s20, s8, 19 +; GFX8-NEXT: s_lshr_b32 s22, s8, 16 +; GFX8-NEXT: s_lshr_b32 s24, s8, 17 +; GFX8-NEXT: s_lshr_b32 s6, s8, 24 ; GFX8-NEXT: s_bfe_i64 s[2:3], s[6:7], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[8:9], s[4:5], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[4:5], s[8:9], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 @@ -5693,97 +5691,101 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v16, s11 ; GFX8-NEXT: v_mov_b32_e32 v15, s10 ; GFX8-NEXT: s_add_u32 s10, s0, 0x70 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 14, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 15, s4 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 14, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 15, s8 ; GFX8-NEXT: v_mov_b32_e32 v11, s22 ; GFX8-NEXT: v_mov_b32_e32 v12, s23 ; GFX8-NEXT: v_mov_b32_e32 v13, s24 ; GFX8-NEXT: v_mov_b32_e32 v14, s25 ; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 12, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 13, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 10, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 11, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 9, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 6, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s4 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 12, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 13, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 10, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v6, 11, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 9, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 6, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s8 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 5, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 2, s4 +; GFX8-NEXT: v_lshrrev_b16_e64 v16, 5, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v20, 2, s8 ; GFX8-NEXT: v_bfe_i32 v11, v10, 0, 1 ; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v14, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v21, 3, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s4 -; GFX8-NEXT: s_add_u32 s4, s0, 0x60 +; GFX8-NEXT: v_lshrrev_b16_e64 v21, 3, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s8 +; GFX8-NEXT: s_add_u32 s8, s0, 0x60 ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GFX8-NEXT: v_mov_b32_e32 v13, s10 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v12, s5 +; GFX8-NEXT: v_mov_b32_e32 v12, s9 ; GFX8-NEXT: v_bfe_i32 v9, v8, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v11, s4 -; GFX8-NEXT: s_add_u32 s4, s0, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v11, s8 +; GFX8-NEXT: s_add_u32 s8, s0, 0x50 ; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[7:10] ; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v10, s5 +; GFX8-NEXT: v_mov_b32_e32 v10, s9 ; GFX8-NEXT: v_bfe_i32 v7, v6, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v9, s4 -; GFX8-NEXT: s_add_u32 s4, s0, 64 +; GFX8-NEXT: v_mov_b32_e32 v9, s8 +; GFX8-NEXT: s_add_u32 s8, s0, 64 ; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8] -; GFX8-NEXT: v_mov_b32_e32 v11, s5 +; GFX8-NEXT: v_mov_b32_e32 v11, s9 ; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1 ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NEXT: s_add_u32 s4, s0, 48 +; GFX8-NEXT: v_mov_b32_e32 v10, s8 +; GFX8-NEXT: s_add_u32 s8, s0, 48 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[3:6] ; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 1 ; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v11, s5 +; GFX8-NEXT: v_mov_b32_e32 v11, s9 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX8-NEXT: v_mov_b32_e32 v10, s4 +; GFX8-NEXT: v_mov_b32_e32 v10, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 6, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 7, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 4, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 5, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 2, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v6, 3, s6 ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[1:4] -; GFX8-NEXT: s_add_u32 s4, s0, 32 +; GFX8-NEXT: v_bfe_i32 v18, v16, 0, 1 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 32 ; GFX8-NEXT: v_bfe_i32 v2, v1, 0, 1 -; GFX8-NEXT: v_bfe_i32 v18, v16, 0, 1 ; GFX8-NEXT: v_bfe_i32 v16, v0, 0, 1 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: s_add_u32 s6, s0, 16 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GFX8-NEXT: v_bfe_i32 v20, v20, 0, 1 ; GFX8-NEXT: v_bfe_i32 v18, v22, 0, 1 ; GFX8-NEXT: v_bfe_i32 v22, v21, 0, 1 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_ashrrev_i32_e32 v23, 31, v22 ; GFX8-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23] ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 6, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 7, s6 ; GFX8-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX8-NEXT: v_mov_b32_e32 v16, s8 -; GFX8-NEXT: v_mov_b32_e32 v17, s9 +; GFX8-NEXT: v_mov_b32_e32 v16, s4 +; GFX8-NEXT: v_mov_b32_e32 v17, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_add_u32 s4, s0, 0xf0 ; GFX8-NEXT: v_bfe_i32 v14, v13, 0, 1 @@ -5795,8 +5797,6 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s0, 0xe0 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 4, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 5, s6 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 @@ -5804,8 +5804,6 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: v_bfe_i32 v8, v8, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s0, 0xd0 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 2, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 3, s6 ; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX8-NEXT: s_addc_u32 s5, s1, 0 @@ -6031,50 +6029,50 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s4, s2, 0x10003 ; GFX6-NEXT: s_bfe_u32 s5, s2, 0x10005 -; GFX6-NEXT: s_bfe_u32 s6, s2, 0x10007 -; GFX6-NEXT: s_bfe_u32 s10, s2, 0x10009 -; GFX6-NEXT: s_bfe_u32 s12, s2, 0x1000b -; GFX6-NEXT: s_bfe_u32 s14, s2, 0x1000d -; GFX6-NEXT: s_bfe_u32 s16, s2, 0x1000f -; GFX6-NEXT: s_bfe_u32 s18, s2, 0x10011 -; GFX6-NEXT: s_bfe_u32 s20, s2, 0x10013 -; GFX6-NEXT: s_bfe_u32 s22, s2, 0x10015 -; GFX6-NEXT: s_bfe_u32 s24, s2, 0x10017 -; GFX6-NEXT: s_bfe_u32 s25, s2, 0x10019 -; GFX6-NEXT: s_bfe_u32 s26, s2, 0x1001b -; GFX6-NEXT: s_bfe_u32 s27, s2, 0x1001d -; GFX6-NEXT: s_lshr_b32 s28, s2, 31 -; GFX6-NEXT: s_bfe_u32 s29, s3, 0x10003 -; GFX6-NEXT: s_bfe_u32 s30, s3, 0x10005 -; GFX6-NEXT: s_bfe_u32 s31, s3, 0x10007 -; GFX6-NEXT: s_bfe_u32 s33, s3, 0x10009 -; GFX6-NEXT: s_bfe_u32 s34, s3, 0x1000b -; GFX6-NEXT: s_bfe_u32 s35, s3, 0x1000d -; GFX6-NEXT: s_bfe_u32 s36, s3, 0x1000f -; GFX6-NEXT: s_bfe_u32 s37, s3, 0x10011 -; GFX6-NEXT: s_bfe_u32 s38, s3, 0x10013 -; GFX6-NEXT: s_bfe_u32 s39, s3, 0x10015 -; GFX6-NEXT: s_bfe_u32 s40, s3, 0x10017 -; GFX6-NEXT: s_bfe_u32 s41, s3, 0x10019 -; GFX6-NEXT: s_bfe_u32 s42, s3, 0x1001b -; GFX6-NEXT: s_bfe_u32 s43, s3, 0x1001d -; GFX6-NEXT: s_lshr_b32 s44, s3, 31 +; GFX6-NEXT: s_bfe_u32 s8, s2, 0x10007 +; GFX6-NEXT: s_bfe_u32 s11, s2, 0x10009 +; GFX6-NEXT: s_bfe_u32 s13, s2, 0x1000b +; GFX6-NEXT: s_bfe_u32 s15, s2, 0x1000d +; GFX6-NEXT: s_bfe_u32 s17, s2, 0x1000f +; GFX6-NEXT: s_bfe_u32 s19, s2, 0x10011 +; GFX6-NEXT: s_bfe_u32 s21, s2, 0x10013 +; GFX6-NEXT: s_bfe_u32 s23, s2, 0x10015 +; GFX6-NEXT: s_bfe_u32 s25, s2, 0x10017 +; GFX6-NEXT: s_bfe_u32 s27, s2, 0x10019 +; GFX6-NEXT: s_bfe_u32 s29, s2, 0x1001b +; GFX6-NEXT: s_bfe_u32 s31, s2, 0x1001d +; GFX6-NEXT: s_lshr_b32 s34, s2, 31 +; GFX6-NEXT: s_bfe_u32 s36, s3, 0x10003 +; GFX6-NEXT: s_bfe_u32 s37, s3, 0x10005 +; GFX6-NEXT: s_bfe_u32 s38, s3, 0x10007 +; GFX6-NEXT: s_bfe_u32 s39, s3, 0x10009 +; GFX6-NEXT: s_bfe_u32 s40, s3, 0x1000b +; GFX6-NEXT: s_bfe_u32 s41, s3, 0x1000d +; GFX6-NEXT: s_bfe_u32 s42, s3, 0x1000f +; GFX6-NEXT: s_bfe_u32 s43, s3, 0x10011 +; GFX6-NEXT: s_bfe_u32 s44, s3, 0x10013 +; GFX6-NEXT: s_bfe_u32 s45, s3, 0x10015 +; GFX6-NEXT: s_bfe_u32 s46, s3, 0x10017 +; GFX6-NEXT: s_bfe_u32 s47, s3, 0x10019 +; GFX6-NEXT: s_bfe_u32 s48, s3, 0x1001b +; GFX6-NEXT: s_bfe_u32 s49, s3, 0x1001d +; GFX6-NEXT: s_lshr_b32 s50, s3, 31 ; GFX6-NEXT: s_bfe_u32 s9, s3, 0x10001 -; GFX6-NEXT: s_bfe_u32 s7, s2, 0x10001 -; GFX6-NEXT: s_and_b32 s8, s2, 1 -; GFX6-NEXT: s_and_b32 s11, s3, 1 -; GFX6-NEXT: s_bfe_u32 s13, s2, 0x10002 -; GFX6-NEXT: s_bfe_u32 s15, s2, 0x10004 -; GFX6-NEXT: s_bfe_u32 s17, s2, 0x10006 -; GFX6-NEXT: s_bfe_u32 s19, s2, 0x10008 -; GFX6-NEXT: s_bfe_u32 s21, s2, 0x1000a -; GFX6-NEXT: s_bfe_u32 s23, s2, 0x1000c -; GFX6-NEXT: s_bfe_u32 s45, s2, 0x1000e -; GFX6-NEXT: s_bfe_u32 s46, s2, 0x10010 -; GFX6-NEXT: s_bfe_u32 s47, s2, 0x10012 -; GFX6-NEXT: s_bfe_u32 s48, s2, 0x10014 -; GFX6-NEXT: s_bfe_u32 s49, s2, 0x10016 -; GFX6-NEXT: s_bfe_u32 s50, s2, 0x10018 +; GFX6-NEXT: s_bfe_u32 s6, s2, 0x10001 +; GFX6-NEXT: s_and_b32 s7, s2, 1 +; GFX6-NEXT: s_and_b32 s10, s3, 1 +; GFX6-NEXT: s_bfe_u32 s12, s2, 0x10002 +; GFX6-NEXT: s_bfe_u32 s14, s2, 0x10004 +; GFX6-NEXT: s_bfe_u32 s16, s2, 0x10006 +; GFX6-NEXT: s_bfe_u32 s18, s2, 0x10008 +; GFX6-NEXT: s_bfe_u32 s20, s2, 0x1000a +; GFX6-NEXT: s_bfe_u32 s22, s2, 0x1000c +; GFX6-NEXT: s_bfe_u32 s24, s2, 0x1000e +; GFX6-NEXT: s_bfe_u32 s26, s2, 0x10010 +; GFX6-NEXT: s_bfe_u32 s28, s2, 0x10012 +; GFX6-NEXT: s_bfe_u32 s30, s2, 0x10014 +; GFX6-NEXT: s_bfe_u32 s33, s2, 0x10016 +; GFX6-NEXT: s_bfe_u32 s35, s2, 0x10018 ; GFX6-NEXT: s_bfe_u32 s51, s2, 0x1001a ; GFX6-NEXT: s_bfe_u32 s52, s2, 0x1001c ; GFX6-NEXT: s_bfe_u32 s53, s2, 0x1001e @@ -6098,138 +6096,138 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v0, s67 -; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v2, s50 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:496 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s68 -; GFX6-NEXT: v_mov_b32_e32 v2, s43 +; GFX6-NEXT: v_mov_b32_e32 v2, s49 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:480 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s66 -; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: v_mov_b32_e32 v2, s48 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:464 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s65 -; GFX6-NEXT: v_mov_b32_e32 v2, s41 +; GFX6-NEXT: v_mov_b32_e32 v2, s47 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:448 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s64 -; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v2, s46 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:432 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s63 -; GFX6-NEXT: v_mov_b32_e32 v2, s39 +; GFX6-NEXT: v_mov_b32_e32 v2, s45 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:416 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s62 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v2, s44 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s61 -; GFX6-NEXT: v_mov_b32_e32 v2, s37 +; GFX6-NEXT: v_mov_b32_e32 v2, s43 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s60 -; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v2, s42 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s59 -; GFX6-NEXT: v_mov_b32_e32 v2, s35 +; GFX6-NEXT: v_mov_b32_e32 v2, s41 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s58 -; GFX6-NEXT: v_mov_b32_e32 v2, s34 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s57 -; GFX6-NEXT: v_mov_b32_e32 v2, s33 +; GFX6-NEXT: v_mov_b32_e32 v2, s39 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s56 -; GFX6-NEXT: v_mov_b32_e32 v2, s31 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s55 -; GFX6-NEXT: v_mov_b32_e32 v2, s30 +; GFX6-NEXT: v_mov_b32_e32 v2, s37 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s54 -; GFX6-NEXT: v_mov_b32_e32 v2, s29 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s53 -; GFX6-NEXT: v_mov_b32_e32 v2, s28 +; GFX6-NEXT: v_mov_b32_e32 v2, s34 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s52 -; GFX6-NEXT: v_mov_b32_e32 v2, s27 +; GFX6-NEXT: v_mov_b32_e32 v2, s31 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s51 -; GFX6-NEXT: v_mov_b32_e32 v2, s26 +; GFX6-NEXT: v_mov_b32_e32 v2, s29 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s50 -; GFX6-NEXT: v_mov_b32_e32 v2, s25 +; GFX6-NEXT: v_mov_b32_e32 v0, s35 +; GFX6-NEXT: v_mov_b32_e32 v2, s27 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s49 -; GFX6-NEXT: v_mov_b32_e32 v2, s24 +; GFX6-NEXT: v_mov_b32_e32 v0, s33 +; GFX6-NEXT: v_mov_b32_e32 v2, s25 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s48 -; GFX6-NEXT: v_mov_b32_e32 v2, s22 +; GFX6-NEXT: v_mov_b32_e32 v0, s30 +; GFX6-NEXT: v_mov_b32_e32 v2, s23 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s47 -; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: v_mov_b32_e32 v0, s28 +; GFX6-NEXT: v_mov_b32_e32 v2, s21 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s46 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: v_mov_b32_e32 v0, s26 +; GFX6-NEXT: v_mov_b32_e32 v2, s19 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s45 -; GFX6-NEXT: v_mov_b32_e32 v2, s16 +; GFX6-NEXT: v_mov_b32_e32 v0, s24 +; GFX6-NEXT: v_mov_b32_e32 v2, s17 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s23 -; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s22 +; GFX6-NEXT: v_mov_b32_e32 v2, s15 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s21 -; GFX6-NEXT: v_mov_b32_e32 v2, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: v_mov_b32_e32 v2, s13 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s19 -; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_mov_b32_e32 v2, s11 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s17 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-NEXT: v_mov_b32_e32 v0, s10 ; GFX6-NEXT: v_mov_b32_e32 v2, s9 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v25, 1 +; GFX8-NEXT: v_mov_b32_e32 v28, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -6244,28 +6242,28 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s2 ; GFX8-NEXT: v_and_b32_e32 v10, 1, v1 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s2 -; GFX8-NEXT: v_mov_b32_e32 v11, s2 +; GFX8-NEXT: v_mov_b32_e32 v12, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s2 -; GFX8-NEXT: s_lshr_b32 s24, s3, 24 -; GFX8-NEXT: s_lshr_b32 s22, s2, 24 +; GFX8-NEXT: s_lshr_b32 s31, s3, 24 +; GFX8-NEXT: s_lshr_b32 s24, s2, 24 ; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v17, 10, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 4, s2 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s2 +; GFX8-NEXT: v_and_b32_e32 v7, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 2, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2 ; GFX8-NEXT: s_bfe_u32 s20, s2, 0x10018 ; GFX8-NEXT: s_bfe_u32 s21, s3, 0x10018 -; GFX8-NEXT: s_and_b32 s23, s3, 1 -; GFX8-NEXT: s_and_b32 s25, s2, 1 +; GFX8-NEXT: s_and_b32 s22, s3, 1 +; GFX8-NEXT: s_and_b32 s23, s2, 1 ; GFX8-NEXT: v_lshrrev_b16_e64 v4, 15, s2 -; GFX8-NEXT: s_bfe_u32 s26, s2, 0x10011 -; GFX8-NEXT: s_bfe_u32 s27, s2, 0x10010 -; GFX8-NEXT: s_bfe_u32 s28, s2, 0x10012 -; GFX8-NEXT: s_bfe_u32 s29, s2, 0x10013 -; GFX8-NEXT: s_bfe_u32 s30, s2, 0x10014 -; GFX8-NEXT: s_bfe_u32 s31, s2, 0x10015 +; GFX8-NEXT: s_bfe_u32 s25, s2, 0x10011 +; GFX8-NEXT: s_bfe_u32 s26, s2, 0x10010 +; GFX8-NEXT: s_bfe_u32 s27, s2, 0x10012 +; GFX8-NEXT: s_bfe_u32 s28, s2, 0x10013 +; GFX8-NEXT: s_bfe_u32 s29, s2, 0x10014 +; GFX8-NEXT: s_bfe_u32 s30, s2, 0x10015 ; GFX8-NEXT: s_bfe_u32 s33, s2, 0x10016 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x10017 ; GFX8-NEXT: s_bfe_u32 s34, s3, 0x10011 @@ -6292,285 +6290,292 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_addc_u32 s17, s1, 0 ; GFX8-NEXT: s_add_u32 s18, s0, 0x80 ; GFX8-NEXT: s_addc_u32 s19, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v1 +; GFX8-NEXT: v_and_b32_e32 v9, 1, v1 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s3 ; GFX8-NEXT: s_add_u32 s42, s0, 0x70 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v23, s42 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 14, s3 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v24, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x170 +; GFX8-NEXT: v_lshrrev_b16_e64 v22, 14, s3 ; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[2:5] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v24, s42 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v22 -; GFX8-NEXT: v_mov_b32_e32 v22, s42 ; GFX8-NEXT: v_lshrrev_b16_e64 v4, 15, s3 -; GFX8-NEXT: v_mov_b32_e32 v23, s43 -; GFX8-NEXT: v_lshrrev_b16_e64 v21, 6, s24 -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[2:5] +; GFX8-NEXT: v_mov_b32_e32 v25, s43 +; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[2:5] ; GFX8-NEXT: s_add_u32 s42, s0, 0x1f0 ; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s3 -; GFX8-NEXT: v_and_b32_e32 v23, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v21 +; GFX8-NEXT: v_lshrrev_b16_e64 v21, 6, s31 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v21, s42 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 6, s22 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 7, s24 -; GFX8-NEXT: v_mov_b32_e32 v22, s43 +; GFX8-NEXT: v_mov_b32_e32 v2, s42 +; GFX8-NEXT: v_and_b32_e32 v24, 1, v21 +; GFX8-NEXT: v_lshrrev_b16_e64 v26, 7, s31 +; GFX8-NEXT: v_mov_b32_e32 v25, v1 +; GFX8-NEXT: v_mov_b32_e32 v27, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0xf0 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[2:5] +; GFX8-NEXT: v_lshrrev_b16_e64 v20, 6, s24 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[24:27] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v20 -; GFX8-NEXT: v_mov_b32_e32 v20, s42 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 7, s22 -; GFX8-NEXT: v_mov_b32_e32 v21, s43 -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[2:5] +; GFX8-NEXT: v_mov_b32_e32 v2, s42 +; GFX8-NEXT: v_and_b32_e32 v24, 1, v20 +; GFX8-NEXT: v_lshrrev_b16_e64 v26, 7, s24 +; GFX8-NEXT: v_mov_b32_e32 v25, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x60 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s3 -; GFX8-NEXT: v_and_b32_e32 v20, 1, v2 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s3 -; GFX8-NEXT: v_and_b32_e32 v21, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v19 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v18 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[24:27] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: v_and_b32_e32 v24, 1, v19 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff, v18 ; GFX8-NEXT: v_mov_b32_e32 v18, s42 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v27, 0 ; GFX8-NEXT: v_mov_b32_e32 v19, s43 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[2:5] ; GFX8-NEXT: s_add_u32 s42, s0, 0x50 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 5, s3 -; GFX8-NEXT: v_and_b32_e32 v19, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v17 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v16 +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[24:27] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: v_and_b32_e32 v24, 1, v17 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff, v16 ; GFX8-NEXT: v_mov_b32_e32 v16, s42 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v27, 0 ; GFX8-NEXT: v_mov_b32_e32 v17, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 64 -; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[2:5] +; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[24:27] +; GFX8-NEXT: v_mov_b32_e32 v19, 1 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff, v15 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v15 ; GFX8-NEXT: v_mov_b32_e32 v15, s42 -; GFX8-NEXT: v_and_b32_sdwa v2, v11, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_and_b32_sdwa v24, v12, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v27, 0 ; GFX8-NEXT: v_mov_b32_e32 v16, s43 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[2:5] ; GFX8-NEXT: s_add_u32 s42, s0, 48 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 3, s3 -; GFX8-NEXT: v_and_b32_e32 v15, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v14 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v13 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[24:27] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v13, s42 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_mov_b32_e32 v14, s43 -; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[2:5] +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff, v13 +; GFX8-NEXT: v_mov_b32_e32 v12, s42 +; GFX8-NEXT: v_and_b32_e32 v24, 1, v14 +; GFX8-NEXT: v_mov_b32_e32 v27, 0 +; GFX8-NEXT: v_mov_b32_e32 v13, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 32 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 1, s3 -; GFX8-NEXT: v_and_b32_e32 v26, 1, v2 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 5, s24 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[24:27] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: v_and_b32_e32 v24, 1, v11 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff, v10 ; GFX8-NEXT: v_mov_b32_e32 v10, s42 -; GFX8-NEXT: v_and_b32_e32 v27, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v12 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v27, 0 ; GFX8-NEXT: v_mov_b32_e32 v11, s43 -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[2:5] +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 5, s31 +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[24:27] +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 3, s31 ; GFX8-NEXT: s_add_u32 s42, s0, 16 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 3, s24 -; GFX8-NEXT: v_and_b32_e32 v29, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v9 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; GFX8-NEXT: v_and_b32_e32 v14, 1, v12 +; GFX8-NEXT: v_and_b32_e32 v25, 1, v10 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v8 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v7 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v8, s42 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x160 -; GFX8-NEXT: v_lshrrev_b16_e64 v24, 12, s3 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[2:5] -; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 1, s24 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; GFX8-NEXT: v_mov_b32_e32 v7, s42 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v24 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v13, 0 +; GFX8-NEXT: v_mov_b32_e32 v11, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, s43 +; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[10:13] +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s31 +; GFX8-NEXT: s_add_u32 s42, s0, 0x160 +; GFX8-NEXT: v_lshrrev_b16_e64 v23, 12, s3 +; GFX8-NEXT: v_and_b32_e32 v27, 1, v7 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s42 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v23 +; GFX8-NEXT: v_mov_b32_e32 v13, 0 +; GFX8-NEXT: v_mov_b32_e32 v11, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, s43 +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[10:13] +; GFX8-NEXT: v_lshrrev_b16_e64 v6, 5, s24 ; GFX8-NEXT: s_add_u32 s42, s0, 0x150 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 10, s3 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[2:5] +; GFX8-NEXT: v_lshrrev_b16_e64 v21, 10, s3 +; GFX8-NEXT: v_and_b32_e32 v23, 1, v6 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 5, s22 -; GFX8-NEXT: v_mov_b32_e32 v7, s42 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v22 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v23 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_mov_b32_e32 v8, s43 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[2:5] +; GFX8-NEXT: v_mov_b32_e32 v6, s42 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v21 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v4 +; GFX8-NEXT: v_mov_b32_e32 v13, 0 +; GFX8-NEXT: v_mov_b32_e32 v7, s43 +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[10:13] +; GFX8-NEXT: v_lshrrev_b16_e64 v6, 1, s24 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s3 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 ; GFX8-NEXT: s_add_u32 s42, s0, 0x140 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 3, s22 -; GFX8-NEXT: v_and_b32_e32 v9, 1, v2 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 1, s22 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v20, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v6 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v7, s42 -; GFX8-NEXT: v_and_b32_e32 v23, 0xffff, v2 -; GFX8-NEXT: v_and_b32_sdwa v2, v0, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v20 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_mov_b32_e32 v8, s43 +; GFX8-NEXT: v_mov_b32_e32 v6, s42 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s3 +; GFX8-NEXT: v_and_b32_sdwa v19, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v20 +; GFX8-NEXT: v_mov_b32_e32 v22, 0 +; GFX8-NEXT: v_mov_b32_e32 v20, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x130 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: v_lshrrev_b16_e64 v18, 6, s3 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[2:5] +; GFX8-NEXT: v_lshrrev_b16_e64 v3, 5, s3 +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[19:22] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v7, s42 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v18 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v21 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_mov_b32_e32 v8, s43 +; GFX8-NEXT: v_mov_b32_e32 v6, s42 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v2 +; GFX8-NEXT: v_mov_b32_e32 v21, 0 +; GFX8-NEXT: v_mov_b32_e32 v19, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x120 ; GFX8-NEXT: v_lshrrev_b16_e64 v17, 4, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 2, s22 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[2:5] +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[18:21] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, s42 -; GFX8-NEXT: v_and_b32_e32 v5, 1, v10 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v9 -; GFX8-NEXT: v_and_b32_e32 v9, 1, v17 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v19 -; GFX8-NEXT: v_mov_b32_e32 v12, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 3, s3 +; GFX8-NEXT: v_and_b32_e32 v17, 1, v17 +; GFX8-NEXT: v_mov_b32_e32 v20, 0 +; GFX8-NEXT: v_mov_b32_e32 v18, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x110 +; GFX8-NEXT: v_and_b32_e32 v15, 1, v5 ; GFX8-NEXT: v_lshrrev_b16_e64 v16, 2, s3 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[9:12] +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 3, s24 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[17:20] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s42 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v14 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v13 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v16 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX8-NEXT: v_mov_b32_e32 v16, 0 -; GFX8-NEXT: v_mov_b32_e32 v14, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s31 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX8-NEXT: v_mov_b32_e32 v19, 0 +; GFX8-NEXT: v_mov_b32_e32 v17, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s43 -; GFX8-NEXT: v_mov_b32_e32 v22, s5 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[13:16] +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v10 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v23 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19] ; GFX8-NEXT: v_mov_b32_e32 v0, s41 +; GFX8-NEXT: v_and_b32_e32 v19, 1, v24 +; GFX8-NEXT: v_mov_b32_e32 v24, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s40 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v21, s4 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v22, s7 +; GFX8-NEXT: v_mov_b32_e32 v23, s4 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v24, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s38 ; GFX8-NEXT: v_mov_b32_e32 v2, s39 -; GFX8-NEXT: v_mov_b32_e32 v21, s6 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v22, s9 +; GFX8-NEXT: v_mov_b32_e32 v23, s6 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v24, s9 ; GFX8-NEXT: v_mov_b32_e32 v0, s36 ; GFX8-NEXT: v_mov_b32_e32 v2, s37 -; GFX8-NEXT: v_mov_b32_e32 v21, s8 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v22, s11 +; GFX8-NEXT: v_mov_b32_e32 v23, s8 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v24, s11 ; GFX8-NEXT: v_mov_b32_e32 v0, s35 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v21, s10 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v22, s13 +; GFX8-NEXT: v_mov_b32_e32 v23, s10 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v24, s13 ; GFX8-NEXT: v_mov_b32_e32 v0, s33 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v21, s12 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v22, s15 -; GFX8-NEXT: v_mov_b32_e32 v0, s30 -; GFX8-NEXT: v_mov_b32_e32 v2, s31 -; GFX8-NEXT: v_mov_b32_e32 v21, s14 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v22, s17 -; GFX8-NEXT: v_mov_b32_e32 v0, s28 -; GFX8-NEXT: v_mov_b32_e32 v2, s29 -; GFX8-NEXT: v_mov_b32_e32 v21, s16 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v22, s19 +; GFX8-NEXT: v_mov_b32_e32 v23, s12 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v24, s15 +; GFX8-NEXT: v_mov_b32_e32 v0, s29 +; GFX8-NEXT: v_mov_b32_e32 v2, s30 +; GFX8-NEXT: v_mov_b32_e32 v23, s14 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v24, s17 ; GFX8-NEXT: v_mov_b32_e32 v0, s27 -; GFX8-NEXT: v_mov_b32_e32 v2, s26 -; GFX8-NEXT: v_mov_b32_e32 v21, s18 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v22, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s28 +; GFX8-NEXT: v_mov_b32_e32 v23, s16 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v24, s19 +; GFX8-NEXT: v_mov_b32_e32 v0, s26 +; GFX8-NEXT: v_mov_b32_e32 v2, s25 +; GFX8-NEXT: v_mov_b32_e32 v23, s18 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 1, s3 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v24, s1 ; GFX8-NEXT: s_add_u32 s2, s0, 0x100 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s25 -; GFX8-NEXT: v_mov_b32_e32 v21, s0 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v26 -; GFX8-NEXT: v_mov_b32_e32 v18, 0 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v22, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s23 -; GFX8-NEXT: v_mov_b32_e32 v2, v14 -; GFX8-NEXT: v_mov_b32_e32 v3, v18 -; GFX8-NEXT: v_mov_b32_e32 v21, s2 +; GFX8-NEXT: v_mov_b32_e32 v23, s0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v14 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX8-NEXT: v_mov_b32_e32 v14, 0 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v24, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s22 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: v_mov_b32_e32 v3, v14 +; GFX8-NEXT: v_mov_b32_e32 v23, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1e0 -; GFX8-NEXT: v_lshrrev_b16_e64 v28, 4, s24 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_and_b32_e32 v17, 1, v28 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v27 -; GFX8-NEXT: v_mov_b32_e32 v20, 0 -; GFX8-NEXT: v_mov_b32_e32 v18, v1 +; GFX8-NEXT: v_mov_b32_e32 v22, 0 +; GFX8-NEXT: v_mov_b32_e32 v20, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1d0 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[17:20] +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[19:22] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v30, 2, s24 +; GFX8-NEXT: v_lshrrev_b16_e64 v26, 2, s31 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1c0 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v30 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v29 -; GFX8-NEXT: v_mov_b32_e32 v16, 0 -; GFX8-NEXT: v_mov_b32_e32 v14, v1 +; GFX8-NEXT: v_and_b32_e32 v15, 1, v26 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v25 +; GFX8-NEXT: v_mov_b32_e32 v18, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, v1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, 0 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[13:16] +; GFX8-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[15:18] ; GFX8-NEXT: v_mov_b32_e32 v0, s21 -; GFX8-NEXT: v_mov_b32_e32 v14, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: v_mov_b32_e32 v13, s2 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, v27 +; GFX8-NEXT: v_mov_b32_e32 v3, v28 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 -; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s22 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 4, s24 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0xd0 -; GFX8-NEXT: v_and_b32_e32 v9, 1, v24 -; GFX8-NEXT: v_mov_b32_e32 v12, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[9:12] +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 2, s24 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[8:11] ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_add_u32 s0, s0, 0xc0 -; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v25, 0 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[5:8] +; GFX8-NEXT: v_mov_b32_e32 v13, 0 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, v23 -; GFX8-NEXT: v_mov_b32_e32 v3, v25 +; GFX8-NEXT: v_mov_b32_e32 v2, v12 +; GFX8-NEXT: v_mov_b32_e32 v3, v13 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -6582,12 +6587,12 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; EG-NEXT: ALU 95, @41, KC0[], KC1[] ; EG-NEXT: ALU 99, @137, KC0[CB0:0-32], KC1[] ; EG-NEXT: ALU 60, @237, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T82.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T81.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T80.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T79.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T78.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T77.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T82.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T81.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T80.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T79.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T78.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T77.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T76.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T75.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T74.X, 0 @@ -6613,149 +6618,149 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T54.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T53.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T52.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T51.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T51.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 38: -; EG-NEXT: VTX_READ_64 T19.XY, T19.X, 0, #1 +; EG-NEXT: VTX_READ_64 T25.XY, T19.X, 0, #1 ; EG-NEXT: ALU clause starting at 40: ; EG-NEXT: MOV * T19.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 41: -; EG-NEXT: LSHR * T20.Z, T19.Y, literal.x, +; EG-NEXT: LSHR * T19.Z, T25.Y, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T20.X, T19.Y, literal.x, 1, -; EG-NEXT: MOV T20.Y, 0.0, -; EG-NEXT: BFE_UINT * T21.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T19.X, T25.Y, literal.x, 1, +; EG-NEXT: MOV T19.Y, 0.0, +; EG-NEXT: BFE_UINT * T20.Z, T25.Y, literal.y, 1, ; EG-NEXT: 30(4.203895e-44), 29(4.063766e-44) -; EG-NEXT: BFE_UINT T21.X, T19.Y, literal.x, 1, -; EG-NEXT: MOV T21.Y, 0.0, -; EG-NEXT: BFE_UINT * T22.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T20.X, T25.Y, literal.x, 1, +; EG-NEXT: MOV T20.Y, 0.0, +; EG-NEXT: BFE_UINT * T21.Z, T25.Y, literal.y, 1, ; EG-NEXT: 28(3.923636e-44), 27(3.783506e-44) -; EG-NEXT: BFE_UINT T22.X, T19.Y, literal.x, 1, -; EG-NEXT: MOV T22.Y, 0.0, -; EG-NEXT: BFE_UINT * T23.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T21.X, T25.Y, literal.x, 1, +; EG-NEXT: MOV T21.Y, 0.0, +; EG-NEXT: BFE_UINT * T22.Z, T25.Y, literal.y, 1, ; EG-NEXT: 26(3.643376e-44), 25(3.503246e-44) -; EG-NEXT: BFE_UINT T23.X, T19.Y, literal.x, 1, -; EG-NEXT: MOV T23.Y, 0.0, -; EG-NEXT: BFE_UINT * T24.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T22.X, T25.Y, literal.x, 1, +; EG-NEXT: MOV T22.Y, 0.0, +; EG-NEXT: BFE_UINT * T23.Z, T25.Y, literal.y, 1, ; EG-NEXT: 24(3.363116e-44), 23(3.222986e-44) -; EG-NEXT: BFE_UINT T24.X, T19.Y, literal.x, 1, -; EG-NEXT: MOV T24.Y, 0.0, -; EG-NEXT: BFE_UINT * T25.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T23.X, T25.Y, literal.x, 1, +; EG-NEXT: MOV T23.Y, 0.0, +; EG-NEXT: BFE_UINT * T24.Z, T25.Y, literal.y, 1, ; EG-NEXT: 22(3.082857e-44), 21(2.942727e-44) -; EG-NEXT: BFE_UINT T25.X, T19.Y, literal.x, 1, -; EG-NEXT: MOV T25.Y, 0.0, -; EG-NEXT: BFE_UINT * T26.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T24.X, T25.Y, literal.x, 1, +; EG-NEXT: MOV T24.Y, 0.0, +; EG-NEXT: BFE_UINT * T26.Z, T25.Y, literal.y, 1, ; EG-NEXT: 20(2.802597e-44), 19(2.662467e-44) -; EG-NEXT: BFE_UINT T26.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T26.X, T25.Y, literal.x, 1, ; EG-NEXT: MOV T26.Y, 0.0, -; EG-NEXT: BFE_UINT * T27.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T27.Z, T25.Y, literal.y, 1, ; EG-NEXT: 18(2.522337e-44), 17(2.382207e-44) -; EG-NEXT: BFE_UINT T27.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T27.X, T25.Y, literal.x, 1, ; EG-NEXT: MOV T27.Y, 0.0, -; EG-NEXT: BFE_UINT * T28.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T28.Z, T25.Y, literal.y, 1, ; EG-NEXT: 16(2.242078e-44), 15(2.101948e-44) -; EG-NEXT: BFE_UINT T28.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T28.X, T25.Y, literal.x, 1, ; EG-NEXT: MOV T28.Y, 0.0, -; EG-NEXT: BFE_UINT * T29.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T29.Z, T25.Y, literal.y, 1, ; EG-NEXT: 14(1.961818e-44), 13(1.821688e-44) -; EG-NEXT: BFE_UINT T29.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T29.X, T25.Y, literal.x, 1, ; EG-NEXT: MOV T29.Y, 0.0, -; EG-NEXT: BFE_UINT * T30.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T30.Z, T25.Y, literal.y, 1, ; EG-NEXT: 12(1.681558e-44), 11(1.541428e-44) -; EG-NEXT: BFE_UINT T30.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T30.X, T25.Y, literal.x, 1, ; EG-NEXT: MOV T30.Y, 0.0, -; EG-NEXT: BFE_UINT * T31.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T31.Z, T25.Y, literal.y, 1, ; EG-NEXT: 10(1.401298e-44), 9(1.261169e-44) -; EG-NEXT: BFE_UINT T31.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T31.X, T25.Y, literal.x, 1, ; EG-NEXT: MOV T31.Y, 0.0, -; EG-NEXT: BFE_UINT * T32.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T32.Z, T25.Y, literal.y, 1, ; EG-NEXT: 8(1.121039e-44), 7(9.809089e-45) -; EG-NEXT: BFE_UINT T32.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T32.X, T25.Y, literal.x, 1, ; EG-NEXT: MOV T32.Y, 0.0, -; EG-NEXT: BFE_UINT * T33.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T33.Z, T25.Y, literal.y, 1, ; EG-NEXT: 6(8.407791e-45), 5(7.006492e-45) -; EG-NEXT: BFE_UINT T33.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T33.X, T25.Y, literal.x, 1, ; EG-NEXT: MOV T33.Y, 0.0, -; EG-NEXT: BFE_UINT * T34.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T34.Z, T25.Y, literal.y, 1, ; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45) -; EG-NEXT: BFE_UINT T34.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T34.X, T25.Y, literal.x, 1, ; EG-NEXT: MOV T34.Y, 0.0, -; EG-NEXT: BFE_UINT T35.Z, T19.Y, 1, 1, -; EG-NEXT: AND_INT * T35.X, T19.Y, 1, +; EG-NEXT: BFE_UINT T35.Z, T25.Y, 1, 1, +; EG-NEXT: AND_INT * T35.X, T25.Y, 1, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: MOV T35.Y, 0.0, -; EG-NEXT: LSHR * T36.Z, T19.X, literal.x, +; EG-NEXT: LSHR * T36.Z, T25.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T36.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T36.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T36.Y, 0.0, -; EG-NEXT: BFE_UINT * T37.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T37.Z, T25.X, literal.y, 1, ; EG-NEXT: 30(4.203895e-44), 29(4.063766e-44) -; EG-NEXT: BFE_UINT T37.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T37.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T37.Y, 0.0, -; EG-NEXT: BFE_UINT * T38.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T38.Z, T25.X, literal.y, 1, ; EG-NEXT: 28(3.923636e-44), 27(3.783506e-44) -; EG-NEXT: BFE_UINT T38.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T38.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T38.Y, 0.0, -; EG-NEXT: BFE_UINT * T39.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T39.Z, T25.X, literal.y, 1, ; EG-NEXT: 26(3.643376e-44), 25(3.503246e-44) -; EG-NEXT: BFE_UINT T39.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T39.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T39.Y, 0.0, -; EG-NEXT: BFE_UINT * T40.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T40.Z, T25.X, literal.y, 1, ; EG-NEXT: 24(3.363116e-44), 23(3.222986e-44) -; EG-NEXT: BFE_UINT T40.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T40.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T40.Y, 0.0, -; EG-NEXT: BFE_UINT * T41.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T41.Z, T25.X, literal.y, 1, ; EG-NEXT: 22(3.082857e-44), 21(2.942727e-44) -; EG-NEXT: BFE_UINT T41.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T41.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T41.Y, 0.0, -; EG-NEXT: BFE_UINT * T42.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T42.Z, T25.X, literal.y, 1, ; EG-NEXT: 20(2.802597e-44), 19(2.662467e-44) -; EG-NEXT: BFE_UINT T42.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T42.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T42.Y, 0.0, -; EG-NEXT: BFE_UINT * T43.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T43.Z, T25.X, literal.y, 1, ; EG-NEXT: 18(2.522337e-44), 17(2.382207e-44) -; EG-NEXT: BFE_UINT * T43.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT * T43.X, T25.X, literal.x, 1, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 137: ; EG-NEXT: MOV T43.Y, 0.0, -; EG-NEXT: BFE_UINT * T44.Z, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT * T44.Z, T25.X, literal.x, 1, ; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T44.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T44.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T44.Y, 0.0, -; EG-NEXT: BFE_UINT * T45.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T45.Z, T25.X, literal.y, 1, ; EG-NEXT: 14(1.961818e-44), 13(1.821688e-44) -; EG-NEXT: BFE_UINT T45.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T45.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T45.Y, 0.0, -; EG-NEXT: BFE_UINT * T46.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T46.Z, T25.X, literal.y, 1, ; EG-NEXT: 12(1.681558e-44), 11(1.541428e-44) -; EG-NEXT: BFE_UINT T46.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T46.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T46.Y, 0.0, -; EG-NEXT: BFE_UINT * T47.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T47.Z, T25.X, literal.y, 1, ; EG-NEXT: 10(1.401298e-44), 9(1.261169e-44) -; EG-NEXT: BFE_UINT T47.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T47.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T47.Y, 0.0, -; EG-NEXT: BFE_UINT * T48.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T48.Z, T25.X, literal.y, 1, ; EG-NEXT: 8(1.121039e-44), 7(9.809089e-45) -; EG-NEXT: BFE_UINT T48.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T48.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T48.Y, 0.0, -; EG-NEXT: BFE_UINT * T49.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T49.Z, T25.X, literal.y, 1, ; EG-NEXT: 6(8.407791e-45), 5(7.006492e-45) -; EG-NEXT: BFE_UINT T49.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T49.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T49.Y, 0.0, -; EG-NEXT: BFE_UINT * T50.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T50.Z, T25.X, literal.y, 1, ; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45) -; EG-NEXT: BFE_UINT T50.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T50.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T50.Y, 0.0, -; EG-NEXT: BFE_UINT T19.Z, T19.X, 1, 1, -; EG-NEXT: AND_INT * T19.X, T19.X, 1, +; EG-NEXT: BFE_UINT T25.Z, T25.X, 1, 1, +; EG-NEXT: AND_INT * T25.X, T25.X, 1, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T19.Y, 0.0, -; EG-NEXT: MOV T20.W, 0.0, -; EG-NEXT: MOV * T21.W, 0.0, -; EG-NEXT: MOV T22.W, 0.0, -; EG-NEXT: MOV * T23.W, 0.0, -; EG-NEXT: MOV T24.W, 0.0, -; EG-NEXT: MOV * T25.W, 0.0, +; EG-NEXT: MOV T25.Y, 0.0, +; EG-NEXT: MOV T19.W, 0.0, +; EG-NEXT: MOV * T20.W, 0.0, +; EG-NEXT: MOV T21.W, 0.0, +; EG-NEXT: MOV * T22.W, 0.0, +; EG-NEXT: MOV T23.W, 0.0, +; EG-NEXT: MOV * T24.W, 0.0, ; EG-NEXT: MOV T26.W, 0.0, ; EG-NEXT: MOV * T27.W, 0.0, ; EG-NEXT: MOV T28.W, 0.0, @@ -6781,7 +6786,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; EG-NEXT: MOV T48.W, 0.0, ; EG-NEXT: MOV * T49.W, 0.0, ; EG-NEXT: MOV T50.W, 0.0, -; EG-NEXT: MOV * T19.W, 0.0, +; EG-NEXT: MOV * T25.W, 0.0, ; EG-NEXT: LSHR T51.X, KC0[2].Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) @@ -6896,11 +6901,11 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshr_b32 s48, s5, 30 ; GFX6-NEXT: s_lshr_b32 s46, s5, 28 -; GFX6-NEXT: s_lshr_b32 s42, s5, 29 -; GFX6-NEXT: s_lshr_b32 s38, s5, 26 -; GFX6-NEXT: s_lshr_b32 s44, s5, 27 +; GFX6-NEXT: s_lshr_b32 s44, s5, 29 +; GFX6-NEXT: s_lshr_b32 s40, s5, 26 +; GFX6-NEXT: s_lshr_b32 s42, s5, 27 ; GFX6-NEXT: s_lshr_b32 s36, s5, 24 -; GFX6-NEXT: s_lshr_b32 s40, s5, 25 +; GFX6-NEXT: s_lshr_b32 s38, s5, 25 ; GFX6-NEXT: s_lshr_b32 s30, s5, 22 ; GFX6-NEXT: s_lshr_b32 s34, s5, 23 ; GFX6-NEXT: s_lshr_b32 s26, s5, 20 @@ -6925,38 +6930,38 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v5, s53 ; GFX6-NEXT: s_lshr_b32 s52, s5, 9 ; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[54:55], s[46:47], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v6, s48 ; GFX6-NEXT: v_mov_b32_e32 v7, s49 -; GFX6-NEXT: s_lshr_b32 s48, s5, 6 -; GFX6-NEXT: v_mov_b32_e32 v10, s46 -; GFX6-NEXT: v_mov_b32_e32 v11, s47 -; GFX6-NEXT: s_lshr_b32 s46, s5, 7 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: s_ashr_i32 s7, s5, 31 -; GFX6-NEXT: v_mov_b32_e32 v12, s42 -; GFX6-NEXT: v_mov_b32_e32 v13, s43 -; GFX6-NEXT: s_lshr_b32 s42, s5, 4 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_lshr_b32 s46, s5, 6 +; GFX6-NEXT: v_mov_b32_e32 v10, s54 +; GFX6-NEXT: v_mov_b32_e32 v11, s55 +; GFX6-NEXT: s_lshr_b32 s48, s5, 7 ; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v14, s38 -; GFX6-NEXT: v_mov_b32_e32 v15, s39 -; GFX6-NEXT: s_lshr_b32 s54, s5, 5 -; GFX6-NEXT: v_mov_b32_e32 v16, s44 -; GFX6-NEXT: v_mov_b32_e32 v17, s45 -; GFX6-NEXT: s_lshr_b32 s38, s5, 2 +; GFX6-NEXT: s_ashr_i32 s7, s5, 31 +; GFX6-NEXT: v_mov_b32_e32 v12, s44 +; GFX6-NEXT: v_mov_b32_e32 v13, s45 +; GFX6-NEXT: s_lshr_b32 s44, s5, 4 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[54:55], s[42:43], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v14, s40 +; GFX6-NEXT: v_mov_b32_e32 v15, s41 +; GFX6-NEXT: s_lshr_b32 s42, s5, 5 +; GFX6-NEXT: v_mov_b32_e32 v16, s54 +; GFX6-NEXT: v_mov_b32_e32 v17, s55 +; GFX6-NEXT: s_lshr_b32 s40, s5, 2 ; GFX6-NEXT: v_mov_b32_e32 v8, s7 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v9, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:496 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v6, s36 ; GFX6-NEXT: v_mov_b32_e32 v7, s37 ; GFX6-NEXT: s_lshr_b32 s36, s5, 3 -; GFX6-NEXT: v_mov_b32_e32 v8, s40 -; GFX6-NEXT: v_mov_b32_e32 v9, s41 -; GFX6-NEXT: s_lshr_b32 s40, s5, 1 +; GFX6-NEXT: v_mov_b32_e32 v8, s38 +; GFX6-NEXT: v_mov_b32_e32 v9, s39 +; GFX6-NEXT: s_lshr_b32 s38, s5, 1 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:480 @@ -6987,15 +6992,15 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v8, s24 ; GFX6-NEXT: v_mov_b32_e32 v9, s25 ; GFX6-NEXT: s_lshr_b32 s24, s4, 27 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[54:55], s[20:21], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:432 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v10, s18 ; GFX6-NEXT: v_mov_b32_e32 v11, s19 -; GFX6-NEXT: s_lshr_b32 s44, s4, 24 -; GFX6-NEXT: v_mov_b32_e32 v12, s20 -; GFX6-NEXT: v_mov_b32_e32 v13, s21 +; GFX6-NEXT: s_lshr_b32 s20, s4, 24 +; GFX6-NEXT: v_mov_b32_e32 v12, s54 +; GFX6-NEXT: v_mov_b32_e32 v13, s55 ; GFX6-NEXT: s_lshr_b32 s18, s4, 25 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 @@ -7027,50 +7032,50 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v12, s8 ; GFX6-NEXT: v_mov_b32_e32 v13, s9 ; GFX6-NEXT: s_lshr_b32 s8, s4, 19 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[52:53], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:368 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v14, s50 ; GFX6-NEXT: v_mov_b32_e32 v15, s51 ; GFX6-NEXT: s_lshr_b32 s50, s4, 16 -; GFX6-NEXT: v_mov_b32_e32 v16, s20 -; GFX6-NEXT: v_mov_b32_e32 v17, s21 -; GFX6-NEXT: s_lshr_b32 s20, s4, 17 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v16, s52 +; GFX6-NEXT: v_mov_b32_e32 v17, s53 +; GFX6-NEXT: s_lshr_b32 s52, s4, 17 ; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:352 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s48 -; GFX6-NEXT: v_mov_b32_e32 v7, s49 -; GFX6-NEXT: s_lshr_b32 s48, s4, 14 -; GFX6-NEXT: v_mov_b32_e32 v8, s46 -; GFX6-NEXT: v_mov_b32_e32 v9, s47 -; GFX6-NEXT: s_lshr_b32 s46, s4, 15 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[54:55], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v6, s46 +; GFX6-NEXT: v_mov_b32_e32 v7, s47 +; GFX6-NEXT: s_lshr_b32 s46, s4, 14 +; GFX6-NEXT: v_mov_b32_e32 v8, s48 +; GFX6-NEXT: v_mov_b32_e32 v9, s49 +; GFX6-NEXT: s_lshr_b32 s48, s4, 15 +; GFX6-NEXT: s_bfe_i64 s[54:55], s[42:43], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[44:45], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:336 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v10, s42 ; GFX6-NEXT: v_mov_b32_e32 v11, s43 ; GFX6-NEXT: s_lshr_b32 s42, s4, 12 -; GFX6-NEXT: v_mov_b32_e32 v12, s52 -; GFX6-NEXT: v_mov_b32_e32 v13, s53 -; GFX6-NEXT: s_lshr_b32 s52, s4, 13 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v12, s54 +; GFX6-NEXT: v_mov_b32_e32 v13, s55 +; GFX6-NEXT: s_lshr_b32 s44, s4, 13 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s38 -; GFX6-NEXT: v_mov_b32_e32 v15, s39 -; GFX6-NEXT: s_lshr_b32 s38, s4, 10 +; GFX6-NEXT: v_mov_b32_e32 v14, s40 +; GFX6-NEXT: v_mov_b32_e32 v15, s41 +; GFX6-NEXT: s_lshr_b32 s40, s4, 10 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v16, s36 ; GFX6-NEXT: v_mov_b32_e32 v17, s37 ; GFX6-NEXT: s_lshr_b32 s36, s4, 11 -; GFX6-NEXT: v_mov_b32_e32 v2, s40 -; GFX6-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NEXT: s_lshr_b32 s40, s4, 8 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NEXT: s_lshr_b32 s38, s4, 8 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:304 @@ -7101,29 +7106,29 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v16, s24 ; GFX6-NEXT: v_mov_b32_e32 v17, s25 ; GFX6-NEXT: s_lshr_b32 s24, s4, 2 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s44 -; GFX6-NEXT: v_mov_b32_e32 v1, s45 -; GFX6-NEXT: s_lshr_b32 s44, s4, 3 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: v_mov_b32_e32 v1, s21 +; GFX6-NEXT: s_lshr_b32 s20, s4, 3 ; GFX6-NEXT: s_lshr_b32 s4, s4, 1 ; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 @@ -7159,30 +7164,30 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s50 ; GFX6-NEXT: v_mov_b32_e32 v1, s51 -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NEXT: v_mov_b32_e32 v2, s52 +; GFX6-NEXT: v_mov_b32_e32 v3, s53 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s48 -; GFX6-NEXT: v_mov_b32_e32 v1, s49 -; GFX6-NEXT: v_mov_b32_e32 v2, s46 -; GFX6-NEXT: v_mov_b32_e32 v3, s47 +; GFX6-NEXT: v_mov_b32_e32 v0, s46 +; GFX6-NEXT: v_mov_b32_e32 v1, s47 +; GFX6-NEXT: v_mov_b32_e32 v2, s48 +; GFX6-NEXT: v_mov_b32_e32 v3, s49 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s42 ; GFX6-NEXT: v_mov_b32_e32 v1, s43 -; GFX6-NEXT: v_mov_b32_e32 v2, s52 -; GFX6-NEXT: v_mov_b32_e32 v3, s53 +; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v3, s45 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s38 -; GFX6-NEXT: v_mov_b32_e32 v1, s39 +; GFX6-NEXT: v_mov_b32_e32 v0, s40 +; GFX6-NEXT: v_mov_b32_e32 v1, s41 ; GFX6-NEXT: v_mov_b32_e32 v2, s36 ; GFX6-NEXT: v_mov_b32_e32 v3, s37 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s40 -; GFX6-NEXT: v_mov_b32_e32 v1, s41 +; GFX6-NEXT: v_mov_b32_e32 v0, s38 +; GFX6-NEXT: v_mov_b32_e32 v1, s39 ; GFX6-NEXT: v_mov_b32_e32 v2, s30 ; GFX6-NEXT: v_mov_b32_e32 v3, s31 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 @@ -7201,8 +7206,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s24 ; GFX6-NEXT: v_mov_b32_e32 v1, s25 -; GFX6-NEXT: v_mov_b32_e32 v2, s44 -; GFX6-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: v_mov_b32_e32 v3, s21 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: v_mov_b32_e32 v6, s4 ; GFX6-NEXT: v_mov_b32_e32 v7, s5 @@ -7215,31 +7220,31 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_mov_b32 s13, 0 ; GFX8-NEXT: s_mov_b32 s11, s13 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s14, s5, 22 -; GFX8-NEXT: s_lshr_b32 s16, s5, 23 -; GFX8-NEXT: s_lshr_b32 s20, s5, 20 -; GFX8-NEXT: s_lshr_b32 s22, s5, 21 -; GFX8-NEXT: s_lshr_b32 s24, s5, 18 -; GFX8-NEXT: s_lshr_b32 s26, s5, 19 -; GFX8-NEXT: s_lshr_b32 s28, s5, 16 -; GFX8-NEXT: s_lshr_b32 s30, s5, 17 -; GFX8-NEXT: s_lshr_b32 s34, s4, 22 -; GFX8-NEXT: s_lshr_b32 s36, s4, 23 -; GFX8-NEXT: s_lshr_b32 s38, s4, 20 -; GFX8-NEXT: s_lshr_b32 s40, s4, 21 -; GFX8-NEXT: s_lshr_b32 s42, s4, 18 -; GFX8-NEXT: s_lshr_b32 s44, s4, 19 -; GFX8-NEXT: s_lshr_b32 s46, s4, 16 -; GFX8-NEXT: s_lshr_b32 s48, s4, 17 -; GFX8-NEXT: s_mov_b32 s12, s5 -; GFX8-NEXT: s_lshr_b32 s10, s5, 24 -; GFX8-NEXT: s_lshr_b32 s6, s4, 24 +; GFX8-NEXT: s_lshr_b32 s16, s9, 22 +; GFX8-NEXT: s_lshr_b32 s18, s9, 23 +; GFX8-NEXT: s_lshr_b32 s20, s9, 20 +; GFX8-NEXT: s_lshr_b32 s22, s9, 21 +; GFX8-NEXT: s_lshr_b32 s24, s9, 18 +; GFX8-NEXT: s_lshr_b32 s26, s9, 19 +; GFX8-NEXT: s_lshr_b32 s28, s9, 16 +; GFX8-NEXT: s_lshr_b32 s30, s9, 17 +; GFX8-NEXT: s_lshr_b32 s34, s8, 22 +; GFX8-NEXT: s_lshr_b32 s36, s8, 23 +; GFX8-NEXT: s_lshr_b32 s38, s8, 20 +; GFX8-NEXT: s_lshr_b32 s40, s8, 21 +; GFX8-NEXT: s_lshr_b32 s42, s8, 18 +; GFX8-NEXT: s_lshr_b32 s44, s8, 19 +; GFX8-NEXT: s_lshr_b32 s46, s8, 16 +; GFX8-NEXT: s_lshr_b32 s48, s8, 17 +; GFX8-NEXT: s_mov_b32 s12, s9 +; GFX8-NEXT: s_lshr_b32 s10, s9, 24 +; GFX8-NEXT: s_lshr_b32 s6, s8, 24 ; GFX8-NEXT: s_bfe_i64 s[2:3], s[6:7], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[8:9], s[10:11], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[18:19], s[4:5], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[14:15], s[8:9], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 @@ -7254,165 +7259,165 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX8-NEXT: v_mov_b32_e32 v11, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0x1b0 -; GFX8-NEXT: v_mov_b32_e32 v12, s15 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0x1a0 -; GFX8-NEXT: v_mov_b32_e32 v13, s16 -; GFX8-NEXT: v_mov_b32_e32 v14, s17 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v11, s16 +; GFX8-NEXT: s_add_u32 s16, s0, 0x1b0 +; GFX8-NEXT: v_mov_b32_e32 v12, s17 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 +; GFX8-NEXT: v_mov_b32_e32 v13, s18 +; GFX8-NEXT: v_mov_b32_e32 v14, s19 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0x1a0 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0x190 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 ; GFX8-NEXT: v_mov_b32_e32 v11, s20 ; GFX8-NEXT: v_mov_b32_e32 v12, s21 ; GFX8-NEXT: v_mov_b32_e32 v13, s22 ; GFX8-NEXT: v_mov_b32_e32 v14, s23 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0x190 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0x180 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 ; GFX8-NEXT: v_mov_b32_e32 v11, s24 ; GFX8-NEXT: v_mov_b32_e32 v12, s25 ; GFX8-NEXT: v_mov_b32_e32 v13, s26 ; GFX8-NEXT: v_mov_b32_e32 v14, s27 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0x180 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0xb0 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 ; GFX8-NEXT: v_mov_b32_e32 v11, s28 ; GFX8-NEXT: v_mov_b32_e32 v12, s29 ; GFX8-NEXT: v_mov_b32_e32 v13, s30 ; GFX8-NEXT: v_mov_b32_e32 v14, s31 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0xb0 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0xa0 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 ; GFX8-NEXT: v_mov_b32_e32 v11, s34 ; GFX8-NEXT: v_mov_b32_e32 v12, s35 ; GFX8-NEXT: v_mov_b32_e32 v13, s36 ; GFX8-NEXT: v_mov_b32_e32 v14, s37 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0xa0 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0x90 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 ; GFX8-NEXT: v_mov_b32_e32 v11, s38 ; GFX8-NEXT: v_mov_b32_e32 v12, s39 ; GFX8-NEXT: v_mov_b32_e32 v13, s40 ; GFX8-NEXT: v_mov_b32_e32 v14, s41 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0x90 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0x80 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 ; GFX8-NEXT: v_mov_b32_e32 v11, s42 ; GFX8-NEXT: v_mov_b32_e32 v12, s43 ; GFX8-NEXT: v_mov_b32_e32 v13, s44 ; GFX8-NEXT: v_mov_b32_e32 v14, s45 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0x80 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0x70 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 14, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 15, s8 ; GFX8-NEXT: v_mov_b32_e32 v11, s46 ; GFX8-NEXT: v_mov_b32_e32 v12, s47 ; GFX8-NEXT: v_mov_b32_e32 v13, s48 ; GFX8-NEXT: v_mov_b32_e32 v14, s49 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 14, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 15, s4 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0x70 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_bfe_i32 v11, v10, 0, 1 ; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0x60 +; GFX8-NEXT: v_bfe_i32 v11, v10, 0, 1 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 12, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 13, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 12, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 13, s4 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0x60 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[9:12] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_bfe_i32 v9, v8, 0, 1 ; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0x50 +; GFX8-NEXT: v_bfe_i32 v9, v8, 0, 1 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 10, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v6, 11, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 10, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 11, s4 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0x50 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[7:10] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_bfe_i32 v7, v6, 0, 1 ; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 64 +; GFX8-NEXT: v_bfe_i32 v7, v6, 0, 1 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 +; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 9, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 9, s4 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 64 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[5:8] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1 ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 48 +; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 6, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 6, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s4 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 48 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[3:6] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 1 ; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 32 +; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 1 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 5, s4 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 32 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 5, s8 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[1:4] -; GFX8-NEXT: v_mov_b32_e32 v17, s15 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, s16 ; GFX8-NEXT: v_bfe_i32 v2, v13, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v16, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 16 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 2, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 3, s4 +; GFX8-NEXT: v_mov_b32_e32 v17, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 16 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 2, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 3, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v18, s15 +; GFX8-NEXT: v_mov_b32_e32 v18, s17 ; GFX8-NEXT: v_bfe_i32 v2, v11, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v14, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 1, s4 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 1, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v17, s14 +; GFX8-NEXT: v_mov_b32_e32 v17, s16 ; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v18, s1 ; GFX8-NEXT: v_bfe_i32 v2, v12, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 14, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 15, s5 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 14, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 15, s9 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 ; GFX8-NEXT: v_mov_b32_e32 v17, s0 ; GFX8-NEXT: s_add_u32 s14, s0, 0x170 ; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[0:3] @@ -7420,134 +7425,137 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_bfe_i32 v2, v10, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v9, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v9, s14 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 12, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 13, s9 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_mov_b32_e32 v10, s15 -; GFX8-NEXT: s_add_u32 s4, s0, 0x160 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 12, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 13, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 10, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 11, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 8, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 9, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 7, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 5, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 2, s5 +; GFX8-NEXT: s_add_u32 s8, s0, 0x160 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 10, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v6, 11, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 8, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v15, 9, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v16, 7, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 5, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 2, s9 ; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 3, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 1, s5 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 3, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 1, s9 ; GFX8-NEXT: v_bfe_i32 v2, v8, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v7, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v8, s5 -; GFX8-NEXT: v_mov_b32_e32 v7, s4 -; GFX8-NEXT: s_add_u32 s4, s0, 0x150 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v7, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, s9 +; GFX8-NEXT: s_add_u32 s8, s0, 0x150 ; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 6, s10 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: v_bfe_i32 v2, v6, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v5, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v6, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: s_add_u32 s4, s0, 0x140 +; GFX8-NEXT: v_mov_b32_e32 v6, s9 +; GFX8-NEXT: s_add_u32 s8, s0, 0x140 ; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[0:3] -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: v_bfe_i32 v2, v15, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v4, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 0x130 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: s_add_u32 s8, s0, 0x130 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: v_bfe_i32 v2, v16, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v13, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 0x120 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: s_add_u32 s8, s0, 0x120 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: v_bfe_i32 v2, v12, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v11, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 0x110 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: s_add_u32 s8, s0, 0x110 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: v_bfe_i32 v2, v9, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v14, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 0x100 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: s_add_u32 s8, s0, 0x100 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: v_bfe_i32 v2, v10, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 0x1f0 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 6, s10 ; GFX8-NEXT: v_lshrrev_b16_e64 v8, 7, s10 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: s_add_u32 s8, s0, 0x1f0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 4, s10 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: v_bfe_i32 v2, v8, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v7, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v7, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v7, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v17, 4, s10 ; GFX8-NEXT: v_lshrrev_b16_e64 v18, 5, s10 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 7, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v15, 4, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 5, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 2, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 3, s6 ; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GFX8-NEXT: s_add_u32 s4, s0, 0x1e0 +; GFX8-NEXT: v_bfe_i32 v18, v18, 0, 1 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0x1e0 ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 1 -; GFX8-NEXT: v_bfe_i32 v18, v18, 0, 1 ; GFX8-NEXT: v_bfe_i32 v16, v17, 0, 1 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_lshrrev_b16_e64 v20, 2, s10 ; GFX8-NEXT: v_lshrrev_b16_e64 v21, 3, s10 ; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s10 ; GFX8-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 0x1d0 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: s_add_u32 s6, s0, 0x1d0 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GFX8-NEXT: v_bfe_i32 v20, v20, 0, 1 ; GFX8-NEXT: v_bfe_i32 v18, v22, 0, 1 ; GFX8-NEXT: v_bfe_i32 v22, v21, 0, 1 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_ashrrev_i32_e32 v23, 31, v22 ; GFX8-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v16, s4 ; GFX8-NEXT: s_add_u32 s4, s0, 0x1c0 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23] +; GFX8-NEXT: v_mov_b32_e32 v17, s5 ; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 7, s6 ; GFX8-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX8-NEXT: v_mov_b32_e32 v16, s8 -; GFX8-NEXT: v_mov_b32_e32 v17, s9 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s0, 0xf0 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 4, s6 ; GFX8-NEXT: v_bfe_i32 v14, v12, 0, 1 ; GFX8-NEXT: v_bfe_i32 v12, v13, 0, 1 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19] @@ -7558,15 +7566,12 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s0, 0xe0 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 5, s6 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_bfe_i32 v10, v9, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s0, 0xd0 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 2, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 3, s6 ; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX8-NEXT: s_addc_u32 s5, s1, 0 @@ -7606,30 +7611,30 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T43.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T76.XYZW, T42.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T55.XYZW, T41.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T75.XYZW, T40.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T56.XYZW, T39.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T74.XYZW, T38.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T37.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T66.XYZW, T36.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T58.XYZW, T35.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T72.XYZW, T34.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T59.XYZW, T33.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T71.XYZW, T32.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T31.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T70.XYZW, T30.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T61.XYZW, T29.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T69.XYZW, T28.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T62.XYZW, T27.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T75.XYZW, T39.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T56.XYZW, T38.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T74.XYZW, T37.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T36.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T66.XYZW, T35.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T58.XYZW, T34.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T72.XYZW, T33.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T59.XYZW, T32.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T71.XYZW, T31.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T30.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T70.XYZW, T29.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T61.XYZW, T28.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T69.XYZW, T27.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T62.XYZW, T26.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T68.XYZW, T25.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T24.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T23.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T23.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T64.XYZW, T22.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T67.XYZW, T21.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T65.XYZW, T20.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T19.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 38: -; EG-NEXT: VTX_READ_64 T26.XY, T26.X, 0, #1 +; EG-NEXT: VTX_READ_64 T40.XY, T26.X, 0, #1 ; EG-NEXT: ALU clause starting at 40: ; EG-NEXT: LSHR T19.X, KC0[2].Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, @@ -7655,238 +7660,238 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 63: -; EG-NEXT: LSHR T27.X, T0.W, literal.x, +; EG-NEXT: LSHR T26.X, T0.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43) -; EG-NEXT: LSHR T28.X, PV.W, literal.x, +; EG-NEXT: LSHR T27.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43) -; EG-NEXT: LSHR T29.X, PV.W, literal.x, +; EG-NEXT: LSHR T28.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43) -; EG-NEXT: LSHR T30.X, PV.W, literal.x, +; EG-NEXT: LSHR T29.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43) -; EG-NEXT: LSHR T31.X, PV.W, literal.x, +; EG-NEXT: LSHR T30.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43) -; EG-NEXT: LSHR T32.X, PV.W, literal.x, +; EG-NEXT: LSHR T31.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43) -; EG-NEXT: LSHR T33.X, PV.W, literal.x, +; EG-NEXT: LSHR T32.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 224(3.138909e-43) -; EG-NEXT: LSHR T34.X, PV.W, literal.x, +; EG-NEXT: LSHR T33.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43) -; EG-NEXT: LSHR T35.X, PV.W, literal.x, +; EG-NEXT: LSHR T34.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 256(3.587324e-43) -; EG-NEXT: LSHR T36.X, PV.W, literal.x, +; EG-NEXT: LSHR T35.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 272(3.811532e-43) -; EG-NEXT: LSHR T37.X, PV.W, literal.x, +; EG-NEXT: LSHR T36.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 288(4.035740e-43) -; EG-NEXT: LSHR T38.X, PV.W, literal.x, +; EG-NEXT: LSHR T37.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 304(4.259947e-43) -; EG-NEXT: LSHR T39.X, PV.W, literal.x, +; EG-NEXT: LSHR T38.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 320(4.484155e-43) -; EG-NEXT: LSHR T40.X, PV.W, literal.x, +; EG-NEXT: LSHR T39.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 336(4.708363e-43) ; EG-NEXT: LSHR T41.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 352(4.932571e-43) ; EG-NEXT: LSHR T42.X, PV.W, literal.x, -; EG-NEXT: LSHR T0.Z, T26.Y, literal.y, -; EG-NEXT: LSHR T0.W, T26.Y, literal.z, +; EG-NEXT: LSHR T0.Z, T40.Y, literal.y, +; EG-NEXT: LSHR T0.W, T40.Y, literal.z, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w, ; EG-NEXT: 2(2.802597e-45), 28(3.923636e-44) ; EG-NEXT: 29(4.063766e-44), 368(5.156778e-43) ; EG-NEXT: LSHR T43.X, PS, literal.x, -; EG-NEXT: LSHR T0.Y, T26.Y, literal.y, -; EG-NEXT: LSHR T1.Z, T26.Y, literal.z, -; EG-NEXT: LSHR * T1.W, T26.Y, literal.w, +; EG-NEXT: LSHR T0.Y, T40.Y, literal.y, +; EG-NEXT: LSHR T1.Z, T40.Y, literal.z, +; EG-NEXT: LSHR * T1.W, T40.Y, literal.w, ; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44) ; EG-NEXT: 25(3.503246e-44), 20(2.802597e-44) ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, ; EG-NEXT: 384(5.380986e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T44.X, PV.W, literal.x, -; EG-NEXT: LSHR T1.Y, T26.Y, literal.y, -; EG-NEXT: LSHR T2.Z, T26.Y, literal.z, -; EG-NEXT: LSHR * T2.W, T26.Y, literal.w, +; EG-NEXT: LSHR T1.Y, T40.Y, literal.y, +; EG-NEXT: LSHR T2.Z, T40.Y, literal.z, +; EG-NEXT: LSHR * T2.W, T40.Y, literal.w, ; EG-NEXT: 2(2.802597e-45), 21(2.942727e-44) ; EG-NEXT: 16(2.242078e-44), 17(2.382207e-44) ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.x, ; EG-NEXT: 400(5.605194e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T45.X, PV.W, literal.x, -; EG-NEXT: LSHR T2.Y, T26.Y, literal.y, -; EG-NEXT: LSHR T3.Z, T26.Y, literal.z, -; EG-NEXT: LSHR * T3.W, T26.Y, literal.w, +; EG-NEXT: LSHR T2.Y, T40.Y, literal.y, +; EG-NEXT: LSHR T3.Z, T40.Y, literal.z, +; EG-NEXT: LSHR * T3.W, T40.Y, literal.w, ; EG-NEXT: 2(2.802597e-45), 12(1.681558e-44) ; EG-NEXT: 13(1.821688e-44), 8(1.121039e-44) ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.x, ; EG-NEXT: 416(5.829402e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T46.X, PV.W, literal.x, -; EG-NEXT: LSHR T3.Y, T26.Y, literal.y, -; EG-NEXT: LSHR T4.Z, T26.Y, literal.z, -; EG-NEXT: LSHR * T4.W, T26.Y, literal.w, +; EG-NEXT: LSHR T3.Y, T40.Y, literal.y, +; EG-NEXT: LSHR T4.Z, T40.Y, literal.z, +; EG-NEXT: LSHR * T4.W, T40.Y, literal.w, ; EG-NEXT: 2(2.802597e-45), 9(1.261169e-44) ; EG-NEXT: 4(5.605194e-45), 5(7.006492e-45) ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.x, ; EG-NEXT: 432(6.053609e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T47.X, PV.W, literal.x, ; EG-NEXT: ADD_INT T4.Y, KC0[2].Y, literal.y, -; EG-NEXT: LSHR T5.Z, T26.Y, 1, -; EG-NEXT: LSHR T5.W, T26.X, literal.z, +; EG-NEXT: LSHR T5.Z, T40.Y, 1, +; EG-NEXT: LSHR T5.W, T40.X, literal.z, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.w, ; EG-NEXT: 2(2.802597e-45), 464(6.502025e-43) ; EG-NEXT: 28(3.923636e-44), 448(6.277817e-43) ; EG-NEXT: ALU clause starting at 153: ; EG-NEXT: LSHR T48.X, T6.W, literal.x, -; EG-NEXT: LSHR T5.Y, T26.X, literal.y, -; EG-NEXT: LSHR T6.Z, T26.X, literal.z, -; EG-NEXT: LSHR * T6.W, T26.X, literal.w, +; EG-NEXT: LSHR T5.Y, T40.X, literal.y, +; EG-NEXT: LSHR T6.Z, T40.X, literal.z, +; EG-NEXT: LSHR * T6.W, T40.X, literal.w, ; EG-NEXT: 2(2.802597e-45), 29(4.063766e-44) ; EG-NEXT: 24(3.363116e-44), 25(3.503246e-44) -; EG-NEXT: LSHR * T7.W, T26.X, literal.x, +; EG-NEXT: LSHR * T7.W, T40.X, literal.x, ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T49.X, T26.X, 0.0, 1, -; EG-NEXT: LSHR T6.Y, T26.X, literal.x, -; EG-NEXT: ASHR T50.Z, T26.Y, literal.y, -; EG-NEXT: LSHR T8.W, T26.Y, literal.z, -; EG-NEXT: LSHR * T9.W, T26.Y, literal.w, +; EG-NEXT: BFE_INT T49.X, T40.X, 0.0, 1, +; EG-NEXT: LSHR T6.Y, T40.X, literal.x, +; EG-NEXT: ASHR T50.Z, T40.Y, literal.y, +; EG-NEXT: LSHR T8.W, T40.Y, literal.z, +; EG-NEXT: LSHR * T9.W, T40.Y, literal.w, ; EG-NEXT: 21(2.942727e-44), 31(4.344025e-44) ; EG-NEXT: 27(3.783506e-44), 30(4.203895e-44) ; EG-NEXT: BFE_INT T50.X, PS, 0.0, 1, -; EG-NEXT: LSHR T7.Y, T26.X, literal.x, +; EG-NEXT: LSHR T7.Y, T40.X, literal.x, ; EG-NEXT: BFE_INT T51.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T8.W, T26.Y, literal.y, -; EG-NEXT: LSHR * T9.W, T26.Y, literal.z, +; EG-NEXT: LSHR T8.W, T40.Y, literal.y, +; EG-NEXT: LSHR * T9.W, T40.Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 23(3.222986e-44) ; EG-NEXT: 26(3.643376e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T51.X, PS, 0.0, 1, ; EG-NEXT: MOV T50.Y, PV.X, ; EG-NEXT: BFE_INT T52.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T8.W, T26.Y, literal.x, -; EG-NEXT: LSHR * T9.W, T26.Y, literal.y, +; EG-NEXT: LSHR T8.W, T40.Y, literal.x, +; EG-NEXT: LSHR * T9.W, T40.Y, literal.y, ; EG-NEXT: 19(2.662467e-44), 22(3.082857e-44) ; EG-NEXT: BFE_INT T52.X, PS, 0.0, 1, ; EG-NEXT: MOV T51.Y, PV.X, ; EG-NEXT: BFE_INT T53.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T8.W, T26.Y, literal.x, -; EG-NEXT: LSHR * T9.W, T26.Y, literal.y, +; EG-NEXT: LSHR T8.W, T40.Y, literal.x, +; EG-NEXT: LSHR * T9.W, T40.Y, literal.y, ; EG-NEXT: 15(2.101948e-44), 18(2.522337e-44) ; EG-NEXT: BFE_INT T53.X, PS, 0.0, 1, ; EG-NEXT: MOV T52.Y, PV.X, ; EG-NEXT: BFE_INT T54.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T8.W, T26.Y, literal.x, -; EG-NEXT: LSHR * T9.W, T26.Y, literal.y, +; EG-NEXT: LSHR T8.W, T40.Y, literal.x, +; EG-NEXT: LSHR * T9.W, T40.Y, literal.y, ; EG-NEXT: 11(1.541428e-44), 14(1.961818e-44) ; EG-NEXT: BFE_INT T54.X, PS, 0.0, 1, ; EG-NEXT: MOV T53.Y, PV.X, ; EG-NEXT: BFE_INT T55.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T8.W, T26.Y, literal.x, -; EG-NEXT: LSHR * T9.W, T26.Y, literal.y, +; EG-NEXT: LSHR T8.W, T40.Y, literal.x, +; EG-NEXT: LSHR * T9.W, T40.Y, literal.y, ; EG-NEXT: 7(9.809089e-45), 10(1.401298e-44) ; EG-NEXT: BFE_INT T55.X, PS, 0.0, 1, ; EG-NEXT: MOV T54.Y, PV.X, ; EG-NEXT: BFE_INT T56.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T8.W, T26.Y, literal.x, -; EG-NEXT: LSHR * T9.W, T26.Y, literal.y, +; EG-NEXT: LSHR T8.W, T40.Y, literal.x, +; EG-NEXT: LSHR * T9.W, T40.Y, literal.y, ; EG-NEXT: 3(4.203895e-45), 6(8.407791e-45) ; EG-NEXT: BFE_INT T56.X, PS, 0.0, 1, ; EG-NEXT: MOV T55.Y, PV.X, ; EG-NEXT: BFE_INT T57.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T8.W, T26.X, literal.x, -; EG-NEXT: LSHR * T9.W, T26.Y, literal.y, +; EG-NEXT: LSHR T8.W, T40.X, literal.x, +; EG-NEXT: LSHR * T9.W, T40.Y, literal.y, ; EG-NEXT: 17(2.382207e-44), 2(2.802597e-45) ; EG-NEXT: BFE_INT T57.X, PS, 0.0, 1, ; EG-NEXT: MOV T56.Y, PV.X, -; EG-NEXT: ASHR T58.Z, T26.X, literal.x, -; EG-NEXT: LSHR T9.W, T26.X, literal.y, -; EG-NEXT: LSHR * T10.W, T26.X, literal.z, +; EG-NEXT: ASHR T58.Z, T40.X, literal.x, +; EG-NEXT: LSHR T9.W, T40.X, literal.y, +; EG-NEXT: LSHR * T10.W, T40.X, literal.z, ; EG-NEXT: 31(4.344025e-44), 27(3.783506e-44) ; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T58.X, PS, 0.0, 1, ; EG-NEXT: MOV T57.Y, PV.X, ; EG-NEXT: BFE_INT T59.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T9.W, T26.X, literal.x, -; EG-NEXT: LSHR * T10.W, T26.X, literal.y, +; EG-NEXT: LSHR T9.W, T40.X, literal.x, +; EG-NEXT: LSHR * T10.W, T40.X, literal.y, ; EG-NEXT: 23(3.222986e-44), 26(3.643376e-44) ; EG-NEXT: BFE_INT T59.X, PS, 0.0, 1, ; EG-NEXT: MOV T58.Y, PV.X, ; EG-NEXT: BFE_INT T60.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T9.W, T26.X, literal.x, -; EG-NEXT: LSHR * T10.W, T26.X, literal.y, +; EG-NEXT: LSHR T9.W, T40.X, literal.x, +; EG-NEXT: LSHR * T10.W, T40.X, literal.y, ; EG-NEXT: 19(2.662467e-44), 22(3.082857e-44) ; EG-NEXT: BFE_INT T60.X, PS, 0.0, 1, ; EG-NEXT: MOV T59.Y, PV.X, ; EG-NEXT: BFE_INT T61.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T9.W, T26.X, literal.x, -; EG-NEXT: LSHR * T10.W, T26.X, literal.y, +; EG-NEXT: LSHR T9.W, T40.X, literal.x, +; EG-NEXT: LSHR * T10.W, T40.X, literal.y, ; EG-NEXT: 15(2.101948e-44), 18(2.522337e-44) ; EG-NEXT: BFE_INT T61.X, PS, 0.0, 1, ; EG-NEXT: MOV T60.Y, PV.X, ; EG-NEXT: BFE_INT T62.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T9.W, T26.X, literal.x, -; EG-NEXT: LSHR * T10.W, T26.X, literal.y, +; EG-NEXT: LSHR T9.W, T40.X, literal.x, +; EG-NEXT: LSHR * T10.W, T40.X, literal.y, ; EG-NEXT: 11(1.541428e-44), 14(1.961818e-44) ; EG-NEXT: BFE_INT T62.X, PS, 0.0, 1, ; EG-NEXT: MOV T61.Y, PV.X, ; EG-NEXT: BFE_INT T63.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T9.W, T26.X, literal.x, -; EG-NEXT: LSHR * T10.W, T26.X, literal.y, +; EG-NEXT: LSHR T9.W, T40.X, literal.x, +; EG-NEXT: LSHR * T10.W, T40.X, literal.y, ; EG-NEXT: 7(9.809089e-45), 10(1.401298e-44) ; EG-NEXT: BFE_INT T63.X, PS, 0.0, 1, ; EG-NEXT: MOV T62.Y, PV.X, ; EG-NEXT: BFE_INT T64.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR * T9.W, T26.X, literal.x, +; EG-NEXT: LSHR * T9.W, T40.X, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 253: -; EG-NEXT: LSHR * T10.W, T26.X, literal.x, +; EG-NEXT: LSHR * T10.W, T40.X, literal.x, ; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00) ; EG-NEXT: BFE_INT T64.X, PV.W, 0.0, 1, ; EG-NEXT: MOV T63.Y, T63.X, ; EG-NEXT: BFE_INT T65.Z, T9.W, 0.0, 1, -; EG-NEXT: LSHR T9.W, T26.X, 1, BS:VEC_120/SCL_212 -; EG-NEXT: LSHR * T10.W, T26.X, literal.x, +; EG-NEXT: LSHR T9.W, T40.X, 1, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR * T10.W, T40.X, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: BFE_INT T65.X, PS, 0.0, 1, ; EG-NEXT: MOV T64.Y, PV.X, ; EG-NEXT: BFE_INT T49.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T9.W, T26.X, literal.x, -; EG-NEXT: LSHR * T10.W, T26.X, literal.y, +; EG-NEXT: LSHR T9.W, T40.X, literal.x, +; EG-NEXT: LSHR * T10.W, T40.X, literal.y, ; EG-NEXT: 12(1.681558e-44), 5(7.006492e-45) -; EG-NEXT: BFE_INT T66.X, T26.Y, 0.0, 1, +; EG-NEXT: BFE_INT T66.X, T40.Y, 0.0, 1, ; EG-NEXT: MOV T65.Y, PV.X, ; EG-NEXT: BFE_INT T67.Z, PS, 0.0, 1, -; EG-NEXT: LSHR T10.W, T26.X, literal.x, -; EG-NEXT: LSHR * T11.W, T26.X, literal.y, +; EG-NEXT: LSHR T10.W, T40.X, literal.x, +; EG-NEXT: LSHR * T11.W, T40.X, literal.y, ; EG-NEXT: 9(1.261169e-44), 4(5.605194e-45) ; EG-NEXT: BFE_INT T67.X, PS, 0.0, 1, ; EG-NEXT: MOV T49.Y, T49.X, -; EG-NEXT: BFE_INT T26.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T10.W, T26.X, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: LSHR * T11.W, T26.X, literal.y, +; EG-NEXT: BFE_INT T40.Z, PV.W, 0.0, 1, +; EG-NEXT: LSHR T10.W, T40.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR * T11.W, T40.X, literal.y, ; EG-NEXT: 13(1.821688e-44), 8(1.121039e-44) -; EG-NEXT: BFE_INT T26.X, PS, 0.0, 1, +; EG-NEXT: BFE_INT T40.X, PS, 0.0, 1, ; EG-NEXT: MOV T67.Y, PV.X, ; EG-NEXT: BFE_INT T68.Z, PV.W, 0.0, 1, ; EG-NEXT: MOV T49.W, T49.Z, ; EG-NEXT: MOV * T65.W, T65.Z, ; EG-NEXT: BFE_INT T68.X, T9.W, 0.0, 1, -; EG-NEXT: MOV T26.Y, PV.X, +; EG-NEXT: MOV T40.Y, PV.X, ; EG-NEXT: BFE_INT T69.Z, T8.W, 0.0, 1, BS:VEC_120/SCL_212 ; EG-NEXT: MOV T67.W, T67.Z, ; EG-NEXT: MOV * T64.W, T64.Z, ; EG-NEXT: BFE_INT T69.X, T7.Y, 0.0, 1, ; EG-NEXT: MOV T68.Y, PV.X, ; EG-NEXT: BFE_INT T70.Z, T6.Y, 0.0, 1, BS:VEC_120/SCL_212 -; EG-NEXT: MOV T26.W, T26.Z, +; EG-NEXT: MOV T40.W, T40.Z, ; EG-NEXT: MOV * T63.W, T63.Z, ; EG-NEXT: BFE_INT T70.X, T7.W, 0.0, 1, ; EG-NEXT: MOV T69.Y, PV.X, diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 7d98459704918..bee3d455187ca 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -2976,22 +2976,22 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s0, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s3, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s39, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s41, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s43, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s45, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s47, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s49, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s14, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s51, s1, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s52, s0, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s53, s3, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s54, s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s41, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s45, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s47, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s49, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s51, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s53, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s54, s14, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s39, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s40, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s43, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s44, s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff @@ -3086,50 +3086,50 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s54 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s53 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s52 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s47 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s51 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s46 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s50 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s45 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s49 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s48 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s47 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s46 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s45 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s42 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s53 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s43 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s52 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s51 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -3152,10 +3152,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_lshr_b32 s35, s8, 16 ; GCN-HSA-NEXT: s_lshr_b32 s37, s11, 16 ; GCN-HSA-NEXT: s_lshr_b32 s39, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s40, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s41, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s42, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s43, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s41, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s43, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s45, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s47, s14, 16 ; GCN-HSA-NEXT: s_and_b32 s25, s1, 0xffff ; GCN-HSA-NEXT: s_and_b32 s27, s0, 0xffff ; GCN-HSA-NEXT: s_and_b32 s29, s3, 0xffff @@ -3163,10 +3163,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_and_b32 s34, s5, 0xffff ; GCN-HSA-NEXT: s_and_b32 s36, s4, 0xffff ; GCN-HSA-NEXT: s_and_b32 s38, s7, 0xffff -; GCN-HSA-NEXT: s_and_b32 s44, s6, 0xffff -; GCN-HSA-NEXT: s_and_b32 s45, s9, 0xffff -; GCN-HSA-NEXT: s_and_b32 s46, s8, 0xffff -; GCN-HSA-NEXT: s_and_b32 s47, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s40, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s42, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s44, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s46, s11, 0xffff ; GCN-HSA-NEXT: s_and_b32 s48, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s49, s13, 0xffff ; GCN-HSA-NEXT: s_and_b32 s50, s12, 0xffff @@ -3284,18 +3284,18 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -3303,23 +3303,23 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s46 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 @@ -3585,42 +3585,42 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T56.XYZW, T58.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T52.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T53.XYZW, T55.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T39.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T48.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T40.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T46.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T41.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T43.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T36.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T43.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T38.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 22: -; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 0, #1 -; EG-NEXT: VTX_READ_128 T39.XYZW, T35.X, 48, #1 -; EG-NEXT: VTX_READ_128 T40.XYZW, T35.X, 32, #1 -; EG-NEXT: VTX_READ_128 T41.XYZW, T35.X, 16, #1 +; EG-NEXT: VTX_READ_128 T38.XYZW, T37.X, 0, #1 +; EG-NEXT: VTX_READ_128 T39.XYZW, T37.X, 48, #1 +; EG-NEXT: VTX_READ_128 T40.XYZW, T37.X, 32, #1 +; EG-NEXT: VTX_READ_128 T41.XYZW, T37.X, 16, #1 ; EG-NEXT: Fetch clause starting at 30: -; EG-NEXT: VTX_READ_128 T49.XYZW, T35.X, 112, #1 -; EG-NEXT: VTX_READ_128 T50.XYZW, T35.X, 96, #1 -; EG-NEXT: VTX_READ_128 T51.XYZW, T35.X, 80, #1 -; EG-NEXT: VTX_READ_128 T52.XYZW, T35.X, 64, #1 +; EG-NEXT: VTX_READ_128 T49.XYZW, T37.X, 112, #1 +; EG-NEXT: VTX_READ_128 T50.XYZW, T37.X, 96, #1 +; EG-NEXT: VTX_READ_128 T51.XYZW, T37.X, 80, #1 +; EG-NEXT: VTX_READ_128 T52.XYZW, T37.X, 64, #1 ; EG-NEXT: ALU clause starting at 38: -; EG-NEXT: MOV * T35.X, KC0[2].Z, +; EG-NEXT: MOV * T37.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 39: -; EG-NEXT: LSHR * T37.W, T36.Y, literal.x, +; EG-NEXT: LSHR * T35.W, T38.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T37.Z, T36.Y, literal.x, +; EG-NEXT: AND_INT * T35.Z, T38.Y, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHR T37.Y, T36.X, literal.x, -; EG-NEXT: LSHR * T38.W, T36.W, literal.x, +; EG-NEXT: LSHR T35.Y, T38.X, literal.x, +; EG-NEXT: LSHR * T36.W, T38.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T37.X, T36.X, literal.x, -; EG-NEXT: AND_INT T38.Z, T36.W, literal.x, -; EG-NEXT: LSHR * T36.X, KC0[2].Y, literal.y, +; EG-NEXT: AND_INT T35.X, T38.X, literal.x, +; EG-NEXT: AND_INT T36.Z, T38.W, literal.x, +; EG-NEXT: LSHR * T38.X, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) -; EG-NEXT: LSHR T38.Y, T36.Z, literal.x, +; EG-NEXT: LSHR T36.Y, T38.Z, literal.x, ; EG-NEXT: LSHR * T42.W, T41.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T38.X, T36.Z, literal.x, +; EG-NEXT: AND_INT T36.X, T38.Z, literal.x, ; EG-NEXT: AND_INT T42.Z, T41.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) @@ -3657,16 +3657,16 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, -; EG-NEXT: LSHR * T35.W, T39.Y, literal.y, +; EG-NEXT: LSHR * T37.W, T39.Y, literal.y, ; EG-NEXT: 80(1.121039e-43), 16(2.242078e-44) ; EG-NEXT: LSHR T48.X, PV.W, literal.x, -; EG-NEXT: AND_INT * T35.Z, T39.Y, literal.y, +; EG-NEXT: AND_INT * T37.Z, T39.Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) ; EG-NEXT: ALU clause starting at 95: -; EG-NEXT: LSHR T35.Y, T39.X, literal.x, +; EG-NEXT: LSHR T37.Y, T39.X, literal.x, ; EG-NEXT: LSHR * T53.W, T39.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T35.X, T39.X, literal.x, +; EG-NEXT: AND_INT T37.X, T39.X, literal.x, ; EG-NEXT: AND_INT T53.Z, T39.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43) @@ -3762,167 +3762,167 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI: ; %bb.0: ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x10 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s1, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s0, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s35, s1 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s38, s0 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s3, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s40, s2, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s41, s3 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s42, s2 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s5, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s4, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s7, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s6, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s7, s7 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s6 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s47, s9, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s48, s8, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s9, s9 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s8 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s49, s11, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s50, s10, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s11, s11 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s51, s13, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s52, s12, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s13, s13 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s12 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s53, s15, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s54, s14, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s15, s15 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s14, s14 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s55, s17, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s16, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s17, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s16, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s17, s17 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s16, s16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s19, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s18, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s19, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s18, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s19, s19 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s18, s18 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s21, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s60, s20, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s21, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s40, s20, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s21, s21 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s20, s20 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s23, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s62, s22, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s23, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s42, s22, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s23, s23 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s22, s22 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s24, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s64, s25 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s25, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s24, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s25, s25 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s24, s24 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s65, s27, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s66, s26, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s27, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s26, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s27, s27 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s67, s29, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s68, s28, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s26, s26 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s47, s29, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s48, s28, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s29, s29 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s28, s28 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s69, s31, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s70, s30, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s49, s31, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s50, s30, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s31, s31 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s30, s30 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s26, s26 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s25, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s51, s1, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s52, s0, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s53, s1 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s54, s0 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s55, s3, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s2, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s57, s3 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s58, s2 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s5, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s60, s4, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s7, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s62, s6, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s7, s7 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s6 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s8, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s64, s9 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s8 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s65, s11, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s66, s10, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s11, s11 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s67, s13, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s68, s12, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s13, s13 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s12 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s69, s15, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s70, s14, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s15, s15 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s14, s14 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s9, 16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s36 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s37 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s70 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s13 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s67 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s63 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s9 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s61 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s57 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s58 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s57 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s55 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s54 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s52 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s53 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s49 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s46 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s45 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -5640,61 +5640,61 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s4, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s0, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64: @@ -6920,133 +6920,133 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s28, s9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s7 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s5 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s34, s3 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s6, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s28, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s6, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s4, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s0, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[56:57], s[22:23], 0x100000 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s0, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[64:65], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[66:67], s[30:31], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[68:69], s[22:23], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[58:59], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[60:61], s[8:9], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[10:11], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[64:65], s[12:13], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[66:67], s[14:15], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[68:69], s[0:1], 48 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[6:7], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[56:57], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[58:59], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[60:61], s[0:1], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[70:71], s[2:3], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[72:73], s[4:5], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[2:3], s[8:9], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[12:13], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[8:9], s[14:15], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[12:13], s[6:7], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s55 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s65 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s52 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s53 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s62 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s63 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s57 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s68 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s69 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s66 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s67 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[36:37], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[30:31], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[52:53], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[46:47], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[44:45], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[42:43], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[40:41], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[38:39], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[42:43], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[38:39], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s74 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s75 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s72 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s73 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s41 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s70 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s71 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s60 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s61 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s67 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s61 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s59 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s58 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s59 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s56 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s57 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s55 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s51 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s20 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s18 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s11 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 @@ -7066,21 +7066,21 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_mov_b32 s42, s15 -; GCN-HSA-NEXT: s_mov_b32 s44, s13 -; GCN-HSA-NEXT: s_mov_b32 s46, s11 -; GCN-HSA-NEXT: s_mov_b32 s48, s9 -; GCN-HSA-NEXT: s_mov_b32 s50, s7 -; GCN-HSA-NEXT: s_mov_b32 s52, s5 -; GCN-HSA-NEXT: s_mov_b32 s54, s3 -; GCN-HSA-NEXT: s_mov_b32 s56, s1 -; GCN-HSA-NEXT: s_lshr_b32 s58, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s60, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s62, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s64, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s66, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s68, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s70, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s72, s0, 16 +; GCN-HSA-NEXT: s_mov_b32 s48, s13 +; GCN-HSA-NEXT: s_mov_b32 s50, s11 +; GCN-HSA-NEXT: s_mov_b32 s52, s9 +; GCN-HSA-NEXT: s_mov_b32 s54, s7 +; GCN-HSA-NEXT: s_mov_b32 s56, s5 +; GCN-HSA-NEXT: s_mov_b32 s46, s3 +; GCN-HSA-NEXT: s_mov_b32 s58, s1 +; GCN-HSA-NEXT: s_lshr_b32 s60, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s62, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s64, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s66, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s68, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s70, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s72, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s74, s0, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-HSA-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 @@ -7094,7 +7094,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 ; GCN-HSA-NEXT: s_ashr_i64 s[40:41], s[4:5], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[44:45], s[6:7], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[80:81], s[12:13], 48 @@ -7102,44 +7102,44 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[72:73], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[70:71], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[68:69], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[64:65], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[60:61], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[58:59], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[56:57], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[74:75], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[72:73], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[70:71], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[68:69], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[66:67], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[64:65], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[62:63], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[60:61], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[58:59], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 -; GCN-HSA-NEXT: s_add_u32 s56, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s57, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s44 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s45 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s45 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s45 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s45 +; GCN-HSA-NEXT: s_add_u32 s58, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s59, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48 +; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s49 +; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s49 +; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49 +; GCN-HSA-NEXT: s_add_u32 s48, s16, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s44 ; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s45 ; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v30, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s58 ; GCN-HSA-NEXT: v_mov_b32_e32 v31, s45 ; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s59 ; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s80 @@ -7148,34 +7148,34 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 ; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s78 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s79 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s48 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36 ; GCN-HSA-NEXT: s_add_u32 s36, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s53 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s76 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s74 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s75 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s55 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s44 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s38 ; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s57 ; GCN-HSA-NEXT: v_mov_b32_e32 v33, s45 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s39 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index d2ea7a95473e0..8791384101218 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -746,29 +746,29 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v11i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 -; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; GFX7-HSA-NEXT: s_add_u32 s10, s8, 16 -; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11 +; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x8 +; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s2 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s9 -; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 @@ -779,29 +779,29 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v11i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20 -; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; GFX8-NOHSA-NEXT: s_add_u32 s10, s8, 16 -; GFX8-NOHSA-NEXT: s_addc_u32 s11, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x20 +; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s2 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 @@ -2011,37 +2011,39 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX8-NOHSA-NEXT: s_add_u32 s10, s8, 48 +; GFX8-NOHSA-NEXT: s_addc_u32 s11, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: s_nop 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 7c3a5db46a8d8..66fc322e5e04b 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -2808,33 +2808,33 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s2, 24 ; GFX6-NOHSA-NEXT: s_bfe_u32 s23, s2, 0x80008 ; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s3, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s25, s3, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s4, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s27, s4, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s29, s5, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s6, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s31, s6, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s33, s7, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s34, s7, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s35, s8, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s36, s8, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s37, s9, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s38, s9, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s39, s10, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s40, s10, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s41, s11, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s42, s11, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s43, s12, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s44, s12, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s45, s13, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s46, s13, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s47, s14, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s48, s14, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s49, s15, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s50, s15, 0x80008 -; GFX6-NOHSA-NEXT: s_and_b32 s51, s0, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s52, s0, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_u32 s26, s3, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s4, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s28, s4, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s29, s5, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s30, s5, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s31, s6, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s33, s6, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s7, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s35, s7, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s8, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s37, s8, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s9, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s39, s9, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s10, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s41, s10, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s11, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s43, s11, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s12, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s45, s12, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s13, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s47, s13, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s14, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s49, s14, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s15, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s51, s15, 0x80008 +; GFX6-NOHSA-NEXT: s_and_b32 s52, s0, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s25, s0, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s53, s1, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s54, s1, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s55, s2, 0xff @@ -2870,76 +2870,76 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s70 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s50 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s51 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s49 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s50 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s69 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s49 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s47 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s48 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s68 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s46 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s47 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s45 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s46 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s67 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s45 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s12 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s43 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s44 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(3) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s66 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s42 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s43 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s42 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s65 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s41 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s40 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s39 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s38 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s63 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s36 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s62 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s35 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s34 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s33 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s31 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s30 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s59 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s28 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s27 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s57 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s26 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s58 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s24 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 @@ -2956,9 +2956,9 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s20 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s51 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s52 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s52 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s18 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm @@ -2981,42 +2981,42 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 s29, s4, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s31, s5, 24 ; GFX7-HSA-NEXT: s_bfe_u32 s33, s5, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s34, s6, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s35, s6, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s36, s7, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s37, s7, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s38, s8, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s39, s8, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s40, s9, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s41, s9, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s42, s10, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s43, s10, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s44, s11, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s45, s11, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s46, s12, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s47, s12, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s48, s13, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s49, s13, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s50, s14, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s51, s14, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s52, s15, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s53, s15, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s35, s6, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s36, s6, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s38, s7, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s39, s7, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s41, s8, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s42, s8, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s43, s9, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s44, s9, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s45, s10, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s46, s10, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s47, s11, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s48, s11, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s49, s12, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s50, s12, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s51, s13, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s52, s13, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s53, s14, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s54, s14, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s55, s15, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s56, s15, 0x80008 ; GFX7-HSA-NEXT: s_and_b32 s24, s0, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s27, s1, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s30, s2, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s2, s2, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s54, s3, 0xff +; GFX7-HSA-NEXT: s_and_b32 s34, s3, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s3, s3, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s55, s4, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s56, s4, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s57, s5, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s58, s5, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s59, s6, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s6, s6, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s60, s7, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s7, s7, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s37, s4, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s40, s5, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s5, s5, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s57, s6, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s58, s6, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s59, s7, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s60, s7, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s61, s8, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s8, s8, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s62, s9, 0xff @@ -3033,118 +3033,119 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 s14, s14, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s68, s15, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s15, s15, 0x80010 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0xe0 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0xc0 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0xb0 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0xa0 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0x90 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xe0 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xc0 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xa0 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x90 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s67 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s54 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s50 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s53 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0x80 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x80 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s68 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s56 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s55 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s66 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s52 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s51 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s65 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s50 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s49 ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s42 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s38 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s41 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s46 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s44 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s43 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s57 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-HSA-NEXT: s_add_u32 s4, s16, 64 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s37 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s56 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: s_add_u32 s4, s16, 48 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s25 @@ -3214,8 +3215,8 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX8-NOHSA-NEXT: s_and_b32 s28, s4, 0xff ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s4 -; GFX8-NOHSA-NEXT: s_bfe_u32 s41, s4, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s4, s5, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s41, s5, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s42, s5, 0x80010 ; GFX8-NOHSA-NEXT: s_and_b32 s43, s6, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s44, s6, 0x80010 @@ -3327,28 +3328,27 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x60 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s43 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s44 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s29 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s6 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 0x50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s7 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s5 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s41 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s42 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s27 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s6 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 64 +; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s4 ; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s41 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s25 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] @@ -3398,96 +3398,96 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T33.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T37.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T35.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T35.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T31.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T20.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T22.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T29.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T27.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T25.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T21.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T25.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T23.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 22: -; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1 -; EG-NEXT: VTX_READ_128 T21.XYZW, T19.X, 0, #1 +; EG-NEXT: VTX_READ_128 T22.XYZW, T21.X, 16, #1 +; EG-NEXT: VTX_READ_128 T23.XYZW, T21.X, 0, #1 ; EG-NEXT: Fetch clause starting at 26: -; EG-NEXT: VTX_READ_128 T32.XYZW, T19.X, 48, #1 -; EG-NEXT: VTX_READ_128 T33.XYZW, T19.X, 32, #1 +; EG-NEXT: VTX_READ_128 T32.XYZW, T21.X, 48, #1 +; EG-NEXT: VTX_READ_128 T33.XYZW, T21.X, 32, #1 ; EG-NEXT: ALU clause starting at 30: -; EG-NEXT: MOV * T19.X, KC0[2].Z, +; EG-NEXT: MOV * T21.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 31: ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T22.Z, T21.X, literal.x, PV.W, +; EG-NEXT: BFE_UINT * T19.Z, T23.X, literal.x, PV.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T22.Y, T21.X, literal.x, T0.W, -; EG-NEXT: BFE_UINT T23.Z, T21.Y, literal.y, T0.W, -; EG-NEXT: LSHR * T22.W, T21.X, literal.z, +; EG-NEXT: BFE_UINT T19.Y, T23.X, literal.x, T0.W, +; EG-NEXT: BFE_UINT T20.Z, T23.Y, literal.y, T0.W, +; EG-NEXT: LSHR * T19.W, T23.X, literal.z, ; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T22.X, T21.X, literal.x, -; EG-NEXT: BFE_UINT T23.Y, T21.Y, literal.y, T0.W, -; EG-NEXT: LSHR * T21.X, KC0[2].Y, literal.z, +; EG-NEXT: AND_INT T19.X, T23.X, literal.x, +; EG-NEXT: BFE_UINT T20.Y, T23.Y, literal.y, T0.W, +; EG-NEXT: LSHR * T23.X, KC0[2].Y, literal.z, ; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T24.Z, T21.Z, literal.x, T0.W, -; EG-NEXT: LSHR * T23.W, T21.Y, literal.y, +; EG-NEXT: BFE_UINT T24.Z, T23.Z, literal.x, T0.W, +; EG-NEXT: LSHR * T20.W, T23.Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: AND_INT T23.X, T21.Y, literal.x, -; EG-NEXT: BFE_UINT T24.Y, T21.Z, literal.y, T0.W, +; EG-NEXT: AND_INT T20.X, T23.Y, literal.x, +; EG-NEXT: BFE_UINT T24.Y, T23.Z, literal.y, T0.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T25.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T26.Z, T21.W, literal.y, T0.W, -; EG-NEXT: LSHR T24.W, T21.Z, literal.z, -; EG-NEXT: AND_INT * T24.X, T21.Z, literal.w, +; EG-NEXT: BFE_UINT T26.Z, T23.W, literal.y, T0.W, +; EG-NEXT: LSHR T24.W, T23.Z, literal.z, +; EG-NEXT: AND_INT * T24.X, T23.Z, literal.w, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T26.Y, T21.W, literal.x, T0.W, +; EG-NEXT: BFE_UINT T26.Y, T23.W, literal.x, T0.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44) ; EG-NEXT: LSHR T27.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T28.Z, T20.X, literal.y, T0.W, BS:VEC_021/SCL_122 -; EG-NEXT: LSHR T26.W, T21.W, literal.z, -; EG-NEXT: AND_INT * T26.X, T21.W, literal.w, +; EG-NEXT: BFE_UINT T28.Z, T22.X, literal.y, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHR T26.W, T23.W, literal.z, +; EG-NEXT: AND_INT * T26.X, T23.W, literal.w, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T28.Y, T20.X, literal.x, T0.W, +; EG-NEXT: BFE_UINT T28.Y, T22.X, literal.x, T0.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44) ; EG-NEXT: LSHR T29.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T30.Z, T20.Y, literal.y, T0.W, -; EG-NEXT: LSHR T28.W, T20.X, literal.z, -; EG-NEXT: AND_INT * T28.X, T20.X, literal.w, +; EG-NEXT: BFE_UINT T30.Z, T22.Y, literal.y, T0.W, +; EG-NEXT: LSHR T28.W, T22.X, literal.z, +; EG-NEXT: AND_INT * T28.X, T22.X, literal.w, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T30.Y, T20.Y, literal.x, T0.W, +; EG-NEXT: BFE_UINT T30.Y, T22.Y, literal.x, T0.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 8(1.121039e-44), 64(8.968310e-44) -; EG-NEXT: LSHR T20.X, PV.W, literal.x, -; EG-NEXT: LSHR T30.W, T20.Y, literal.y, -; EG-NEXT: AND_INT * T30.X, T20.Y, literal.z, +; EG-NEXT: LSHR T22.X, PV.W, literal.x, +; EG-NEXT: LSHR T30.W, T22.Y, literal.y, +; EG-NEXT: AND_INT * T30.X, T22.Y, literal.z, ; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T19.Z, T20.Z, literal.x, T0.W, +; EG-NEXT: BFE_UINT T21.Z, T22.Z, literal.x, T0.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 80(1.121039e-43) ; EG-NEXT: LSHR T31.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT * T19.Y, T20.Z, literal.y, T0.W, +; EG-NEXT: BFE_UINT * T21.Y, T22.Z, literal.y, T0.W, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: ALU clause starting at 91: -; EG-NEXT: BFE_UINT T34.Z, T20.W, literal.x, T0.W, -; EG-NEXT: LSHR * T19.W, T20.Z, literal.y, +; EG-NEXT: BFE_UINT T34.Z, T22.W, literal.x, T0.W, +; EG-NEXT: LSHR * T21.W, T22.Z, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: AND_INT T19.X, T20.Z, literal.x, -; EG-NEXT: BFE_UINT T34.Y, T20.W, literal.y, T0.W, +; EG-NEXT: AND_INT T21.X, T22.Z, literal.x, +; EG-NEXT: BFE_UINT T34.Y, T22.W, literal.y, T0.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) ; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T35.X, PV.W, literal.x, ; EG-NEXT: BFE_UINT T36.Z, T33.X, literal.y, T0.W, BS:VEC_021/SCL_122 -; EG-NEXT: LSHR T34.W, T20.W, literal.z, -; EG-NEXT: AND_INT * T34.X, T20.W, literal.w, +; EG-NEXT: LSHR T34.W, T22.W, literal.z, +; EG-NEXT: AND_INT * T34.X, T22.W, literal.w, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) ; EG-NEXT: BFE_UINT T36.Y, T33.X, literal.x, T0.W, @@ -3767,15 +3767,15 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_ashr_i32 s37, s6, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s38, s6, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s39, s6, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s41, s7, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s42, s7, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s43, s7, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s45, s8, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s46, s8, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s47, s8, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s48, s9, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s49, s9, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s50, s9, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s40, s7, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s41, s7, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s42, s7, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s43, s8, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s44, s8, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s45, s8, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s47, s9, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s48, s9, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s49, s9, 0x80008 ; GFX7-HSA-NEXT: s_ashr_i32 s51, s10, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s52, s10, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s53, s10, 0x80008 @@ -3794,51 +3794,49 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_ashr_i32 s66, s15, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s67, s15, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s68, s15, 0x80008 -; GFX7-HSA-NEXT: s_sext_i32_i8 s40, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xf0 -; GFX7-HSA-NEXT: s_sext_i32_i8 s44, s7 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xe0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xc0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xb0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xa0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s7 +; GFX7-HSA-NEXT: s_sext_i32_i8 s46, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xf0 +; GFX7-HSA-NEXT: s_sext_i32_i8 s50, s9 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xe0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xc0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s9 ; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s65 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s63 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x80 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: s_sext_i32_i8 s8, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 ; GFX7-HSA-NEXT: s_sext_i32_i8 s12, s12 ; GFX7-HSA-NEXT: s_sext_i32_i8 s13, s13 ; GFX7-HSA-NEXT: s_sext_i32_i8 s15, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s7 -; GFX7-HSA-NEXT: s_sext_i32_i8 s9, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s9 ; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s68 @@ -3856,45 +3854,47 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s56 ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s55 ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s45 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s43 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x70 +; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s53 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s50 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s49 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s48 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s47 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s43 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 -; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 @@ -3960,7 +3960,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v17, 8, s14 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v18, 8, s14 ; GFX8-NOHSA-NEXT: s_ashr_i32 s18, s0, 24 ; GFX8-NOHSA-NEXT: s_bfe_i32 s19, s0, 0x80010 ; GFX8-NOHSA-NEXT: s_ashr_i32 s20, s1, 24 @@ -3996,54 +3996,54 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_bfe_i32 s50, s15, 0x80010 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s15 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s15, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s14 ; GFX8-NOHSA-NEXT: s_add_u32 s14, s16, 0xf0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s15 ; GFX8-NOHSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s14 ; GFX8-NOHSA-NEXT: s_add_u32 s14, s16, 0xe0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v5, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s50 +; GFX8-NOHSA-NEXT: v_bfe_i32 v13, v5, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s50 ; GFX8-NOHSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v19, 8, s12 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v17, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s49 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s47 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s14 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s15 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v20, 8, s12 +; GFX8-NOHSA-NEXT: v_bfe_i32 v13, v18, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s49 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s47 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s14 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s14, s12 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s12, s13 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v18, 8, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s12 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v19, 8, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s12 ; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0xd0 ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s12 ; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0xc0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v18, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s45 +; GFX8-NOHSA-NEXT: v_bfe_i32 v13, v19, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s46 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s45 ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s12 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s12 ; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0xb0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v19, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s44 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s43 +; GFX8-NOHSA-NEXT: v_bfe_i32 v13, v20, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s44 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s43 ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v20, 8, s11 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s11 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s11, s11 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[15:16], v[11:14] +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s13 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s8 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s4 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v20, 0, 8 +; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v11, 0, 8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s42 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s41 @@ -5848,13 +5848,13 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s4, 24 ; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s4, 8 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[4:5], s[4:5], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 56 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[36:37], s[4:5], 56 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[40:41], s[6:7], 56 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 @@ -5863,14 +5863,14 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s37 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s39 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s37 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s24 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s25 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s34 @@ -5899,11 +5899,11 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s26 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s30 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s5 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -6820,71 +6820,72 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s7, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s7, 8 ; GFX6-NOHSA-NEXT: s_mov_b32 s50, s7 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s48, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s4, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s3, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s3, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s6, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s44, s5 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s3, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s3, 8 ; GFX6-NOHSA-NEXT: s_mov_b32 s40, s3 -; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s2, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s2, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s2, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s1, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s1, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[48:49], 0x80000 -; GFX6-NOHSA-NEXT: s_mov_b32 s54, s1 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s2, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s2, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s2, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s1, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s1, 8 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[50:51], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[44:45], 0x80000 +; GFX6-NOHSA-NEXT: s_mov_b32 s62, s1 ; GFX6-NOHSA-NEXT: s_lshr_b32 s56, s0, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s58, s0, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s60, s0, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[48:49], s[0:1], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[0:1], s[0:1], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[2:3], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[64:65], s[4:5], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[66:67], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s64, s0, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s66, s0, 8 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[0:1], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[50:51], s[0:1], 56 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[2:3], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[68:69], s[4:5], 56 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[70:71], s[6:7], 0x80000 ; GFX6-NOHSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 56 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[2:3], s[2:3], 56 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s50 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s51 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s66 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s67 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s65 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s52 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s53 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s5 -; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[34:35], 0x80000 -; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:240 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[54:55], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[40:41], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[60:61], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[58:59], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[50:51], s[56:57], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[54:55], s[2:3], 56 +; GFX6-NOHSA-NEXT: s_mov_b32 s0, s8 +; GFX6-NOHSA-NEXT: s_mov_b32 s1, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s58 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s59 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s70 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s71 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s68 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s69 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s60 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s61 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[46:47], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x80000 +; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:240 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[62:63], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[48:49], s[40:41], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[56:57], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[42:43], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 @@ -6895,78 +6896,81 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:224 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:208 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s13 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s17 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:192 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s54 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s55 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s15 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:192 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[8:11], 0 offset:176 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s62 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s63 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s21 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[8:11], 0 offset:160 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s25 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[8:11], 0 offset:144 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s49 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s27 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[8:11], 0 offset:128 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s52 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s53 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s19 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:160 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s49 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s23 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:144 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s50 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s51 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s47 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s27 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s45 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s29 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s30 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s31 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:96 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s35 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s36 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s39 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:80 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s42 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s43 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:64 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s44 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s45 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[8:11], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s46 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s47 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[8:11], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s50 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s51 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s41 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:16 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s39 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s41 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s5 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: @@ -6976,17 +6980,17 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s14, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s40, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s42, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s44, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s46, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s48, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s50, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s52, s5, 8 -; GFX7-HSA-NEXT: s_mov_b32 s54, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s56, s4, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s58, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s60, s4, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s48, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s50, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s52, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s54, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s56, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s58, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s60, s5, 8 +; GFX7-HSA-NEXT: s_mov_b32 s62, s5 +; GFX7-HSA-NEXT: s_lshr_b32 s44, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s40, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s38, s4, 8 ; GFX7-HSA-NEXT: s_lshr_b32 s36, s3, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s30, s3, 8 ; GFX7-HSA-NEXT: s_mov_b32 s34, s3 @@ -6994,29 +6998,29 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_lshr_b32 s26, s2, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s22, s2, 8 ; GFX7-HSA-NEXT: s_lshr_b32 s18, s1, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s62, s1, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s64, s1, 8 ; GFX7-HSA-NEXT: s_mov_b32 s16, s1 -; GFX7-HSA-NEXT: s_lshr_b32 s64, s0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s66, s0, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s68, s0, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s66, s0, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s68, s0, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s70, s0, 8 ; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 ; GFX7-HSA-NEXT: s_ashr_i64 s[20:21], s[2:3], 56 ; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[38:39], s[4:5], 56 +; GFX7-HSA-NEXT: s_ashr_i64 s[42:43], s[4:5], 56 ; GFX7-HSA-NEXT: s_ashr_i64 s[2:3], s[6:7], 56 ; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 ; GFX7-HSA-NEXT: s_ashr_i64 s[0:1], s[0:1], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[70:71], s[6:7], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[46:47], s[6:7], 0x80000 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[70:71], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[68:69], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[62:63], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[64:65], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 @@ -7024,6 +7028,10 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 @@ -7031,73 +7039,69 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX7-HSA-NEXT: s_add_u32 s62, s8, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s63, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s40 -; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0xe0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s41 -; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s40 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s41 -; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s40 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s41 -; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0xc0 +; GFX7-HSA-NEXT: s_add_u32 s64, s8, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s65, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s48 +; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s49 +; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s49 +; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s46 +; GFX7-HSA-NEXT: s_add_u32 s46, s8, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s47 +; GFX7-HSA-NEXT: s_addc_u32 s47, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s42 +; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s43 +; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s43 +; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s65 +; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s40 +; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 ; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s38 -; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0xb0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s39 -; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s39 -; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0xa0 -; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s39 -; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s43 -; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s46 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s56 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24 ; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x80 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s71 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s49 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s51 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s42 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s58 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s40 ; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s55 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s53 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s56 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s57 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s62 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s63 +; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s41 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s39 ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s25 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] @@ -9626,25 +9630,25 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: ALU 103, @16, KC0[], KC1[] ; EG-NEXT: ALU 104, @120, KC0[], KC1[] ; EG-NEXT: ALU 41, @225, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T42.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T41.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 10: -; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 16, #1 +; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1 ; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1 ; EG-NEXT: ALU clause starting at 14: ; EG-NEXT: MOV * T0.Y, T16.X, ; EG-NEXT: MOV * T35.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 16: -; EG-NEXT: AND_INT T0.W, T36.X, literal.x, +; EG-NEXT: AND_INT T0.W, T37.X, literal.x, ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, ; EG-NEXT: 255(3.573311e-43), -65536(nan) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, ; EG-NEXT: MOV * T16.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T0.W, T36.X, literal.x, +; EG-NEXT: LSHL * T0.W, T37.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, @@ -9654,27 +9658,27 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: MOV T0.Y, T17.X, ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T1.W, T36.X, literal.x, PV.W, +; EG-NEXT: BFE_UINT T1.W, T37.X, literal.x, PV.W, ; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y, ; EG-NEXT: 16(2.242078e-44), -65536(nan) ; EG-NEXT: OR_INT * T1.W, PS, PV.W, ; EG-NEXT: MOV * T17.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T36.X, literal.x, +; EG-NEXT: LSHR * T1.W, T37.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T37.Y, PV.W, PS, +; EG-NEXT: OR_INT * T36.Y, PV.W, PS, ; EG-NEXT: MOV T17.X, PV.Y, ; EG-NEXT: MOV * T0.Y, T12.X, ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T36.Y, literal.y, +; EG-NEXT: AND_INT * T2.W, T37.Y, literal.y, ; EG-NEXT: -65536(nan), 255(3.573311e-43) ; EG-NEXT: OR_INT * T1.W, PV.W, PS, ; EG-NEXT: MOV * T12.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T36.Y, literal.x, +; EG-NEXT: LSHL * T1.W, T37.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, @@ -9682,28 +9686,28 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: OR_INT * T1.W, PV.W, PS, ; EG-NEXT: MOV T12.X, PV.W, ; EG-NEXT: MOV T0.Y, T13.X, -; EG-NEXT: BFE_UINT * T1.W, T36.Y, literal.x, T0.W, +; EG-NEXT: BFE_UINT * T1.W, T37.Y, literal.x, T0.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, ; EG-NEXT: -65536(nan), 0(0.000000e+00) ; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, ; EG-NEXT: MOV * T13.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T36.Y, literal.x, +; EG-NEXT: LSHR * T1.W, T37.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T37.W, PV.W, PS, +; EG-NEXT: OR_INT * T36.W, PV.W, PS, ; EG-NEXT: MOV T13.X, PV.W, ; EG-NEXT: MOV * T0.Y, T8.X, ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T36.Z, literal.y, +; EG-NEXT: AND_INT * T2.W, T37.Z, literal.y, ; EG-NEXT: -65536(nan), 255(3.573311e-43) ; EG-NEXT: OR_INT * T1.W, PV.W, PS, ; EG-NEXT: MOV * T8.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T36.Z, literal.x, +; EG-NEXT: LSHL * T1.W, T37.Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, @@ -9711,28 +9715,28 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: OR_INT * T1.W, PV.W, PS, ; EG-NEXT: MOV T8.X, PV.W, ; EG-NEXT: MOV T0.Y, T9.X, -; EG-NEXT: BFE_UINT * T1.W, T36.Z, literal.x, T0.W, +; EG-NEXT: BFE_UINT * T1.W, T37.Z, literal.x, T0.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, ; EG-NEXT: -65536(nan), 0(0.000000e+00) ; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, ; EG-NEXT: MOV * T9.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T36.Z, literal.x, +; EG-NEXT: LSHR * T1.W, T37.Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T36.Y, PV.W, PS, +; EG-NEXT: OR_INT * T37.Y, PV.W, PS, ; EG-NEXT: MOV T9.X, PV.Y, ; EG-NEXT: MOV * T0.Y, T4.X, ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T36.W, literal.y, +; EG-NEXT: AND_INT * T2.W, T37.W, literal.y, ; EG-NEXT: -65536(nan), 255(3.573311e-43) ; EG-NEXT: OR_INT * T1.W, PV.W, PS, ; EG-NEXT: MOV * T4.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T36.W, literal.x, +; EG-NEXT: LSHL * T1.W, T37.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, @@ -9740,7 +9744,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: OR_INT * T1.W, PV.W, PS, ; EG-NEXT: MOV T4.X, PV.W, ; EG-NEXT: MOV T0.Y, T5.X, -; EG-NEXT: BFE_UINT * T1.W, T36.W, literal.x, T0.W, +; EG-NEXT: BFE_UINT * T1.W, T37.W, literal.x, T0.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 120: ; EG-NEXT: AND_INT * T2.W, T0.Y, literal.x, @@ -9748,12 +9752,12 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, ; EG-NEXT: MOV * T5.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T36.W, literal.x, +; EG-NEXT: LSHR * T1.W, T37.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T36.W, PV.W, PS, +; EG-NEXT: OR_INT * T37.W, PV.W, PS, ; EG-NEXT: MOV T5.X, PV.W, ; EG-NEXT: MOV * T0.Y, T32.X, ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, @@ -9883,10 +9887,10 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: MOV T21.X, PV.W, -; EG-NEXT: MOV * T37.X, T16.X, -; EG-NEXT: MOV * T37.Z, T12.X, -; EG-NEXT: MOV T36.X, T8.X, -; EG-NEXT: MOV T36.Z, T4.X, BS:VEC_120/SCL_212 +; EG-NEXT: MOV * T36.X, T16.X, +; EG-NEXT: MOV * T36.Z, T12.X, +; EG-NEXT: MOV T37.X, T8.X, +; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212 ; EG-NEXT: MOV * T38.X, T32.X, ; EG-NEXT: MOV * T38.Z, T28.X, ; EG-NEXT: MOV T35.X, T24.X, diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 11c8fb2422ed0..e89c44d5b94a8 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -622,32 +622,32 @@ entry: define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; GCN-NOHSA-SI-LABEL: global_load_v16i16_align2: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, s10 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, s11 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:2 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v4, off, s[4:7], 0 offset:4 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:6 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v5, off, s[4:7], 0 offset:8 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v3, off, s[4:7], 0 offset:10 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v6, off, s[4:7], 0 offset:12 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v7, off, s[4:7], 0 offset:14 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v8, off, s[4:7], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v9, off, s[4:7], 0 offset:18 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v10, off, s[4:7], 0 offset:20 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v11, off, s[4:7], 0 offset:22 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v12, off, s[4:7], 0 offset:24 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v13, off, s[4:7], 0 offset:26 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v14, off, s[4:7], 0 offset:28 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v15, off, s[4:7], 0 offset:30 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s7 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:2 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v4, off, s[8:11], 0 offset:4 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:6 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v5, off, s[8:11], 0 offset:8 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 offset:10 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v6, off, s[8:11], 0 offset:12 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v7, off, s[8:11], 0 offset:14 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v8, off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v9, off, s[8:11], 0 offset:18 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v10, off, s[8:11], 0 offset:20 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v11, off, s[8:11], 0 offset:22 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v12, off, s[8:11], 0 offset:24 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v13, off, s[8:11], 0 offset:26 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v14, off, s[8:11], 0 offset:28 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v15, off, s[8:11], 0 offset:30 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8) ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 @@ -666,8 +666,8 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v13, v12 ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v5, v11, v10 ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_load_v16i16_align2: @@ -2660,27 +2660,27 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 @@ -2689,64 +2689,64 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s8, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v12 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[11:14] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[16:19] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i32: @@ -3068,22 +3068,22 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 @@ -3101,63 +3101,63 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v0 -; GCN-HSA-NEXT: v_bfe_i32 v18, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v0, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v12 +; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v12, 0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v14 +; GCN-HSA-NEXT: v_bfe_i32 v17, v15, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v15, v14, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v2 -; GCN-HSA-NEXT: v_bfe_i32 v18, v3, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v2, 0, 16 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[15:18] ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v6, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v18, v5, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v4, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 +; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v8 +; GCN-HSA-NEXT: v_bfe_i32 v17, v9, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v15, v8, 0, 16 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v8 -; GCN-HSA-NEXT: v_bfe_i32 v6, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v8, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[15:18] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[11:14] +; GCN-HSA-NEXT: s_waitcnt vmcnt(5) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v13, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v11, v4, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[7:10] ; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v10 -; GCN-HSA-NEXT: v_bfe_i32 v2, v11, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v10, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v12 -; GCN-HSA-NEXT: v_bfe_i32 v6, v13, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v12, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v14 -; GCN-HSA-NEXT: v_bfe_i32 v2, v15, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v14, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v7, v0, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v2 +; GCN-HSA-NEXT: v_bfe_i32 v5, v3, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[3:6] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i32: @@ -3460,103 +3460,115 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xffff, v2 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v32, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xffff, v15 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v33, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v34, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v35, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, 0xffff, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v36, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v13 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, 0xffff, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v40, 0xffff, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xffff, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, 0xffff, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v44, 0xffff, v8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v14 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, 0xffff, v13 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v48, 0xffff, v12 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v18 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v19 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v18 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, 0xffff, v17 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v52, 0xffff, v16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v23 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v22 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v23 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v22 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, 0xffff, v21 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v56, 0xffff, v20 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v27 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v26 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v61, 16, v24 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v27 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v26 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v62, 0xffff, v25 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, 0xffff, v24 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v31 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v29 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v31 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v30 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v25, 0xffff, v11 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v10 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v9 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v43, 0xffff, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v47, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xffff, v30 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v51, 0xffff, v29 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v57, 0xffff, v28 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v55, 0xffff, v27 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 16, v34 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v33 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v62, 16, v32 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v60, 16, v31 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v29, 0xffff, v34 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v61, 0xffff, v32 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v59, 0xffff, v31 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v34, 16, v38 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v37 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v35 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v33, 0xffff, v38 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v37 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v36 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v38, 16, v42 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v41 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v37, 0xffff, v42 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v35, 0xffff, v41 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload @@ -3590,10 +3602,10 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s6 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29] +; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[26:27] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 @@ -3604,71 +3616,71 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s4 -; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s4 +; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s1 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v21 -; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v20 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GCN-HSA-NEXT: v_and_b32_e32 v36, 0xffff, v21 +; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v20 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[24:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[34:37] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xa0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; GCN-HSA-NEXT: v_and_b32_e32 v25, 0xffff, v23 -; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v23 +; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v22 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[23:26] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[32:35] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v16 ; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v17 ; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v19 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v18 ; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v19 ; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[19:22] +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s5 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[19:22] ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13 ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v13 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v14 ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v15 @@ -3686,7 +3698,7 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v11 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v10 ; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[16:19] ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v4 @@ -3695,52 +3707,52 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[11:14] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[11:14] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v6 ; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v33 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v24, 16, v32 -; GCN-HSA-NEXT: v_and_b32_e32 v25, 0xffff, v33 -; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[15:18] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v30 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v29 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v28 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v3 +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v27 +; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v30 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v29 +; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v28 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v27 +; GCN-HSA-NEXT: v_and_b32_e32 v29, 0xffff, v25 +; GCN-HSA-NEXT: v_and_b32_e32 v27, 0xffff, v24 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[23:26] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[27:30] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v35 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v34 -; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v35 -; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v26 +; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v26 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[15:18] ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v29 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v28 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v29 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v28 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v31 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v30 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v3 ; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v31 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v30 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] @@ -3772,102 +3784,112 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v32, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, 0xffff, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v33, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v34, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v35, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, 0xffff, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v29 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v29 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v28 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, 0xffff, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v10 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v46, 0xffff, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, 0xffff, v8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, 0xffff, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v12 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v18 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v19 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v18 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, 0xffff, v17 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, 0xffff, v16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v23 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v22 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v23 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v22 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v58, 0xffff, v21 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v56, 0xffff, v20 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v27 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v26 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v61, 16, v24 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v27 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v26 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v62, 0xffff, v25 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v60, 0xffff, v24 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v31 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v30 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: buffer_store_dword v19, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v20, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v21, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, 0xffff, v11 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xffff, v10 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v49, 0xffff, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v47, 0xffff, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, 0xffff, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v43, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v53, 0xffff, v30 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v29 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v57, 0xffff, v28 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v55, 0xffff, v27 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v34 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v28, 16, v33 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v62, 16, v32 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v60, 16, v31 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v29, 0xffff, v34 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xffff, v33 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v61, 0xffff, v32 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v59, 0xffff, v31 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v38 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v32, 16, v37 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v35 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xffff, v38 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v31, 0xffff, v37 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v36 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v42 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v36, 16, v41 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v37, 0xffff, v42 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v35, 0xffff, v41 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload @@ -3892,35 +3914,35 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T52.XYZW, T58.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T56.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T55.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T53.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T53.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T48.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T47.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T46.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T44.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T43.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T41.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 22: -; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 0, #1 -; EG-NEXT: VTX_READ_128 T38.XYZW, T35.X, 48, #1 -; EG-NEXT: VTX_READ_128 T39.XYZW, T35.X, 32, #1 -; EG-NEXT: VTX_READ_128 T40.XYZW, T35.X, 16, #1 +; EG-NEXT: VTX_READ_128 T36.XYZW, T37.X, 0, #1 +; EG-NEXT: VTX_READ_128 T38.XYZW, T37.X, 48, #1 +; EG-NEXT: VTX_READ_128 T39.XYZW, T37.X, 32, #1 +; EG-NEXT: VTX_READ_128 T40.XYZW, T37.X, 16, #1 ; EG-NEXT: Fetch clause starting at 30: -; EG-NEXT: VTX_READ_128 T49.XYZW, T35.X, 112, #1 -; EG-NEXT: VTX_READ_128 T50.XYZW, T35.X, 96, #1 -; EG-NEXT: VTX_READ_128 T51.XYZW, T35.X, 80, #1 -; EG-NEXT: VTX_READ_128 T52.XYZW, T35.X, 64, #1 +; EG-NEXT: VTX_READ_128 T49.XYZW, T37.X, 112, #1 +; EG-NEXT: VTX_READ_128 T50.XYZW, T37.X, 96, #1 +; EG-NEXT: VTX_READ_128 T51.XYZW, T37.X, 80, #1 +; EG-NEXT: VTX_READ_128 T52.XYZW, T37.X, 64, #1 ; EG-NEXT: ALU clause starting at 38: -; EG-NEXT: MOV * T35.X, KC0[2].Z, +; EG-NEXT: MOV * T37.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 39: -; EG-NEXT: LSHR * T37.W, T36.W, literal.x, +; EG-NEXT: LSHR * T35.W, T36.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T37.Z, T36.W, literal.x, +; EG-NEXT: AND_INT * T35.Z, T36.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHR T37.Y, T36.Z, literal.x, +; EG-NEXT: LSHR T35.Y, T36.Z, literal.x, ; EG-NEXT: LSHR * T36.W, T36.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T37.X, T36.Z, literal.x, +; EG-NEXT: AND_INT T35.X, T36.Z, literal.x, ; EG-NEXT: AND_INT T36.Z, T36.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) @@ -3965,16 +3987,16 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, -; EG-NEXT: LSHR * T35.W, T38.W, literal.y, +; EG-NEXT: LSHR * T37.W, T38.W, literal.y, ; EG-NEXT: 64(8.968310e-44), 16(2.242078e-44) ; EG-NEXT: LSHR T48.X, PV.W, literal.x, -; EG-NEXT: AND_INT * T35.Z, T38.W, literal.y, +; EG-NEXT: AND_INT * T37.Z, T38.W, literal.y, ; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) ; EG-NEXT: ALU clause starting at 96: -; EG-NEXT: LSHR T35.Y, T38.Z, literal.x, +; EG-NEXT: LSHR T37.Y, T38.Z, literal.x, ; EG-NEXT: LSHR * T38.W, T38.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T35.X, T38.Z, literal.x, +; EG-NEXT: AND_INT T37.X, T38.Z, literal.x, ; EG-NEXT: AND_INT T38.Z, T38.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43) @@ -4253,14 +4275,14 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s7 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[40:43], off, s[4:7], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v11 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v10 @@ -4275,76 +4297,76 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v9, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v8, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v31 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v30 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v31, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v30, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 16, v29 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v41, 16, v28 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v29, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v28, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, v35 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 16, v34 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v35, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v34, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v47, 16, v33 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v45, 16, v32 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v46, v33, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v44, v32, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v35 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v34 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v35, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v34, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 16, v33 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 16, v32 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v33, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v32, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v39 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v38 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v39, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v38, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v51, 16, v37 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v49, 16, v36 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v50, v37, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v48, v36, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 16, v27 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v37, 16, v26 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v27, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v36, v26, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v55, 16, v25 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v53, 16, v24 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v25, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v24, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v47, 16, v37 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v45, 16, v36 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v46, v37, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v44, v36, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 16, v43 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v37, 16, v42 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v43, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v36, v42, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v51, 16, v41 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v49, 16, v40 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v50, v41, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v48, v40, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 16, v31 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v41, 16, v30 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v31, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v30, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v55, 16, v29 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v53, 16, v28 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v29, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v28, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v30, 16, v27 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v28, 16, v26 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v29, v27, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v27, v26, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v59, 16, v25 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v57, 16, v24 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v58, v25, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v56, v24, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v23 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v22 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v23, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v22, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v59, 16, v21 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v57, 16, v20 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v58, v21, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v56, v20, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v63, 16, v21 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v61, 16, v20 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v21, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v20, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 16, v19 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v20, 16, v18 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v19, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v18, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v63, 16, v17 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v61, 16, v16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v16, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 16, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 16, v14 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v15, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v14, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v12 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v17 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v17, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v16, 0, 16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload @@ -4386,14 +4408,14 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s6 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] +; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[24:25] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s4 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] +; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] @@ -4404,27 +4426,27 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v21 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v20 -; GCN-HSA-NEXT: v_bfe_i32 v30, v21, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v28, v20, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 16, v21 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 16, v20 +; GCN-HSA-NEXT: v_bfe_i32 v34, v21, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v32, v20, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v23 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v22 -; GCN-HSA-NEXT: v_bfe_i32 v30, v23, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v28, v22, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[28:31] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 16, v23 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 16, v22 +; GCN-HSA-NEXT: v_bfe_i32 v34, v23, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v32, v22, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[32:35] ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s4 ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) @@ -4442,9 +4464,9 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_bfe_i32 v21, v19, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v19, v18, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[19:22] -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[19:22] +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) @@ -4458,7 +4480,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_bfe_i32 v22, v15, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v20, v14, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v38, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[16:19] ; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[20:23] ; GCN-HSA-NEXT: s_waitcnt vmcnt(10) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v9 @@ -4470,7 +4492,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 ; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[15:18] +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[15:18] ; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[11:14] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v1 @@ -4500,38 +4522,38 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v26 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 16, v28 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: v_bfe_i32 v15, v26, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v25 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 16, v24 -; GCN-HSA-NEXT: v_bfe_i32 v21, v25, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v19, v24, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 16, v33 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 16, v32 -; GCN-HSA-NEXT: v_bfe_i32 v25, v33, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v23, v32, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v19, v28, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v27 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v26 +; GCN-HSA-NEXT: v_bfe_i32 v2, v27, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v26, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v28, 16, v25 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 16, v24 +; GCN-HSA-NEXT: v_bfe_i32 v27, v25, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v25, v24, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[23:26] +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[25:28] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v35 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v34 -; GCN-HSA-NEXT: v_bfe_i32 v2, v35, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v34, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v29 +; GCN-HSA-NEXT: v_bfe_i32 v21, v29, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v27 -; GCN-HSA-NEXT: v_bfe_i32 v17, v27, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v31 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v30 +; GCN-HSA-NEXT: v_bfe_i32 v17, v31, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v15, v30, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] @@ -4561,102 +4583,112 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[23:26], off, s[8:11], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v3 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v32, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 16, v15 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v14 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v14, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v33, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v34, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v35, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v39, 16, v1 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v37, 16, v0 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v38, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v36, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v29 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v28 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v29, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v28, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v7 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v7, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v43, 16, v5 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v41, 16, v4 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v42, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v40, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v10 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v10, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v47, 16, v9 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v45, 16, v8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v46, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v44, v8, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v15 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v14 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v15, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v14, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v51, 16, v13 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v49, 16, v12 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v50, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v48, v12, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v19 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v18 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v19, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v18, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v55, 16, v17 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v53, 16, v16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v54, v17, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v52, v16, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v23 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v22 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v23, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v22, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v59, 16, v21 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v57, 16, v20 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v58, v21, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v56, v20, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 16, v27 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 16, v26 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v27, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v26, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v63, 16, v25 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v61, 16, v24 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v62, v25, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v60, v24, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 16, v31 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 16, v30 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v31, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v30, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v13 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 16, v12 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: buffer_store_dword v14, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v11 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v10 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v11, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v42, 16, v9 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v40, 16, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v41, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v39, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v3 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v2 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v50, 16, v1 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v48, 16, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v49, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v47, v0, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v36 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v35 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v36, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v35, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v7 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v46, 16, v5 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v44, 16, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v45, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v43, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v54, 16, v26 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v52, 16, v25 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v53, v26, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v51, v25, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v58, 16, v24 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v56, 16, v23 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v57, v24, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v55, v23, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 16, v30 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 16, v29 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v30, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v29, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v62, 16, v28 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v60, 16, v27 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v61, v28, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v59, v27, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 16, v34 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 16, v33 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v34, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v33, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v32 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v31 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v32, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v31, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v34, 16, v38 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v32, 16, v37 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v33, v38, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v31, v37, 0, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload @@ -4878,7 +4910,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; CM-NEXT: ALU 82, @57, KC0[CB0:0-32], KC1[] ; CM-NEXT: ALU 72, @140, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T65, T66.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T36.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T36, T35.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T64, T56.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T37, T55.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T63, T54.X @@ -4896,8 +4928,8 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: Fetch clause starting at 24: -; CM-NEXT: VTX_READ_128 T35.XYZW, T37.X, 16, #1 -; CM-NEXT: VTX_READ_128 T36.XYZW, T37.X, 0, #1 +; CM-NEXT: VTX_READ_128 T36.XYZW, T37.X, 16, #1 +; CM-NEXT: VTX_READ_128 T35.XYZW, T37.X, 0, #1 ; CM-NEXT: Fetch clause starting at 28: ; CM-NEXT: VTX_READ_128 T41.XYZW, T37.X, 112, #1 ; CM-NEXT: VTX_READ_128 T42.XYZW, T37.X, 96, #1 @@ -4914,22 +4946,22 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; CM-NEXT: 2(2.802597e-45), 240(3.363116e-43) ; CM-NEXT: LSHR T39.X, PV.W, literal.x, -; CM-NEXT: LSHR T0.Y, T36.Z, literal.y, -; CM-NEXT: LSHR T0.Z, T36.W, literal.y, +; CM-NEXT: LSHR T0.Y, T35.Z, literal.y, +; CM-NEXT: LSHR T0.Z, T35.W, literal.y, ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; CM-NEXT: 192(2.690493e-43), 0(0.000000e+00) ; CM-NEXT: LSHR T40.X, PV.W, literal.x, -; CM-NEXT: LSHR T1.Y, T36.Y, literal.y, -; CM-NEXT: LSHR T1.Z, T35.Z, literal.y, -; CM-NEXT: LSHR * T0.W, T35.W, literal.y, +; CM-NEXT: LSHR T1.Y, T35.Y, literal.y, +; CM-NEXT: LSHR T1.Z, T36.Z, literal.y, +; CM-NEXT: LSHR * T0.W, T36.W, literal.y, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; CM-NEXT: ALU clause starting at 57: -; CM-NEXT: LSHR T2.Z, T35.X, literal.x, +; CM-NEXT: LSHR T2.Z, T36.X, literal.x, ; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; CM-NEXT: 16(2.242078e-44), 208(2.914701e-43) ; CM-NEXT: LSHR T46.X, PV.W, literal.x, -; CM-NEXT: LSHR T2.Y, T35.Y, literal.y, +; CM-NEXT: LSHR T2.Y, T36.Y, literal.y, ; CM-NEXT: LSHR T3.Z, T37.Z, literal.y, ; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) @@ -5051,31 +5083,31 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; CM-NEXT: BFE_INT T63.X, T37.Z, 0.0, literal.x, ; CM-NEXT: BFE_INT T62.Y, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: BFE_INT T37.Z, T35.Y, 0.0, literal.x, +; CM-NEXT: BFE_INT T37.Z, T36.Y, 0.0, literal.x, ; CM-NEXT: BFE_INT * T45.W, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T37.X, T35.X, 0.0, literal.x, +; CM-NEXT: BFE_INT T37.X, T36.X, 0.0, literal.x, ; CM-NEXT: BFE_INT T45.Y, T4.Z, 0.0, literal.x, -; CM-NEXT: BFE_INT T64.Z, T35.W, 0.0, literal.x, +; CM-NEXT: BFE_INT T64.Z, T36.W, 0.0, literal.x, ; CM-NEXT: BFE_INT * T63.W, T3.Y, 0.0, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T64.X, T35.Z, 0.0, literal.x, +; CM-NEXT: BFE_INT T64.X, T36.Z, 0.0, literal.x, ; CM-NEXT: BFE_INT T63.Y, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: BFE_INT T35.Z, T36.Y, 0.0, literal.x, +; CM-NEXT: BFE_INT T36.Z, T35.Y, 0.0, literal.x, ; CM-NEXT: BFE_INT * T37.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T35.X, T36.X, 0.0, literal.x, +; CM-NEXT: BFE_INT T36.X, T35.X, 0.0, literal.x, ; CM-NEXT: BFE_INT T37.Y, T2.Z, 0.0, literal.x, -; CM-NEXT: BFE_INT T65.Z, T36.W, 0.0, literal.x, +; CM-NEXT: BFE_INT T65.Z, T35.W, 0.0, literal.x, ; CM-NEXT: BFE_INT * T64.W, T0.W, 0.0, literal.x, BS:VEC_120/SCL_212 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T65.X, T36.Z, 0.0, literal.x, +; CM-NEXT: BFE_INT T65.X, T35.Z, 0.0, literal.x, ; CM-NEXT: BFE_INT T64.Y, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: LSHR T1.Z, T36.X, literal.x, -; CM-NEXT: BFE_INT * T35.W, T1.Y, 0.0, literal.x, +; CM-NEXT: LSHR T1.Z, T35.X, literal.x, +; CM-NEXT: BFE_INT * T36.W, T1.Y, 0.0, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T36.X, KC0[2].Y, literal.x, -; CM-NEXT: BFE_INT T35.Y, PV.Z, 0.0, literal.y, +; CM-NEXT: LSHR T35.X, KC0[2].Y, literal.x, +; CM-NEXT: BFE_INT T36.Y, PV.Z, 0.0, literal.y, ; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.y, ; CM-NEXT: BFE_INT * T65.W, T0.Z, 0.0, literal.y, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) @@ -5987,14 +6019,14 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, 0 @@ -6004,8 +6036,8 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 @@ -6015,10 +6047,10 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v1 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v3 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64: @@ -6872,12 +6904,12 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 @@ -6909,48 +6941,48 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[0:1], 48 -; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[4:5], 48 +; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[2:3], 48 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[6:7], 48 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v3, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v10, v16, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(2) -; GCN-HSA-NEXT: v_mov_b32_e32 v16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-HSA-NEXT: v_ashr_i64 v[14:15], v[4:5], 48 -; GCN-HSA-NEXT: v_bfe_i32 v12, v5, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v6, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v6, v9, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v9, v16, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[7:10] +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: v_mov_b32_e32 v11, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GCN-HSA-NEXT: v_ashr_i64 v[14:15], v[0:1], 48 +; GCN-HSA-NEXT: v_bfe_i32 v12, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v0, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v2, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v10, v9, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[6:7], 48 -; GCN-HSA-NEXT: v_bfe_i32 v2, v17, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[2:3], 48 +; GCN-HSA-NEXT: v_bfe_i32 v6, v16, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: v_bfe_i32 v4, v16, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v11, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; @@ -6964,62 +6996,62 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[1:4], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[5:8], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v10, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v14, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v13, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 31, v26 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v16i16_to_v16i64: @@ -7213,139 +7245,113 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[2:5], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[6:9], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[10:13], off, s[8:11], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[14:17], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[19:22], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[25:28], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[29:32], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[33:36], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v20 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v2 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v4 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, 0xffff, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v22 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v7 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, 0xffff, v9 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, 0xffff, v10 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v12 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, 0xffff, v11 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, 0xffff, v13 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v60, 16, v14 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, 0xffff, v14 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v55, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v57, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v51, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v53, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v43, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v45, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v35, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v37, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, v23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v1 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v59, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v47, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v39, v1 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v27 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v21 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, 0xffff, v19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v62, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v20 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v25 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v25 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v27 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v37, 0xffff, v26 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v43, 16, v28 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v41, 0xffff, v28 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v47, 16, v29 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, 0xffff, v29 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v31 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v31 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v51, 16, v30 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xffff, v30 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v55, 16, v32 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xffff, v32 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v36 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v59, 16, v33 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v57, 0xffff, v33 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v35 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v35 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v54, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v56, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v50, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v52, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v42, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v44, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v38, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v40, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v58, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v46, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v61, v24 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, 0 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v41, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v33, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v49, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v61, 0 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v63, 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v60, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v62, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v63, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(4) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v60, 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[53:56], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[37:40], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[57:60], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -7516,95 +7522,95 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[29:32], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[33:36], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v56, 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[15:18], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v55, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v54, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v52, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v47, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v49, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v24, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v58, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v38, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v53, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v44, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v57, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v41, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v55 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v29 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v37, 0xffff, v29 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v41, 0xffff, v31 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v29, 0xffff, v30 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v46, 16, v32 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, 0xffff, v32 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v32, v56 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xffff, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v35 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v35 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v56 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v19, 0xffff, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v1 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v33 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v33 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v57, 16, v36 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v55, 0xffff, v36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v34, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v56 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v56 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, 0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v44, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v56 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[37:40], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v28 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, 0xffff, v30 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v31 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v43, 0xffff, v31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v55 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, 0xffff, v18 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, v55 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, 0xffff, v34 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v55 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v19, 0xffff, v12 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v32 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v47, 0xffff, v32 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v33 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v33 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v56, 16, v35 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, 0xffff, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v55 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v50, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v55 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v32i16_to_v32i64: @@ -7777,117 +7783,117 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; CM-NEXT: ALU 33, @31, KC0[], KC1[] ; CM-NEXT: TEX 0 @28 ; CM-NEXT: ALU 94, @65, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T23, T50.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T50.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T24, T49.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T25, T48.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T26, T47.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T46.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T21, T46.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T27, T45.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T28, T44.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T29, T43.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T42.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T42.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T30, T41.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T31, T40.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T32, T39.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T21, T38.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T22, T38.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T33, T37.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T34, T36.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T22.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T23.X ; CM-NEXT: CF_END ; CM-NEXT: Fetch clause starting at 22: -; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1 -; CM-NEXT: VTX_READ_128 T21.XYZW, T19.X, 32, #1 -; CM-NEXT: VTX_READ_128 T22.XYZW, T19.X, 16, #1 +; CM-NEXT: VTX_READ_128 T21.XYZW, T20.X, 0, #1 +; CM-NEXT: VTX_READ_128 T22.XYZW, T20.X, 32, #1 +; CM-NEXT: VTX_READ_128 T23.XYZW, T20.X, 16, #1 ; CM-NEXT: Fetch clause starting at 28: -; CM-NEXT: VTX_READ_128 T22.XYZW, T19.X, 48, #1 +; CM-NEXT: VTX_READ_128 T23.XYZW, T20.X, 48, #1 ; CM-NEXT: ALU clause starting at 30: -; CM-NEXT: MOV * T19.X, KC0[2].Z, +; CM-NEXT: MOV * T20.X, KC0[2].Z, ; CM-NEXT: ALU clause starting at 31: -; CM-NEXT: LSHR * T23.Z, T20.Y, literal.x, +; CM-NEXT: LSHR * T19.Z, T21.Y, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: AND_INT T23.X, T20.Y, literal.x, -; CM-NEXT: MOV T23.Y, 0.0, -; CM-NEXT: LSHR * T24.Z, T20.X, literal.y, +; CM-NEXT: AND_INT T19.X, T21.Y, literal.x, +; CM-NEXT: MOV T19.Y, 0.0, +; CM-NEXT: LSHR * T24.Z, T21.X, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T24.X, T20.X, literal.x, +; CM-NEXT: AND_INT T24.X, T21.X, literal.x, ; CM-NEXT: MOV T24.Y, 0.0, -; CM-NEXT: LSHR * T25.Z, T20.W, literal.y, +; CM-NEXT: LSHR * T25.Z, T21.W, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T25.X, T20.W, literal.x, +; CM-NEXT: AND_INT T25.X, T21.W, literal.x, ; CM-NEXT: MOV T25.Y, 0.0, -; CM-NEXT: LSHR * T26.Z, T20.Z, literal.y, +; CM-NEXT: LSHR * T26.Z, T21.Z, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T26.X, T20.Z, literal.x, +; CM-NEXT: AND_INT T26.X, T21.Z, literal.x, ; CM-NEXT: MOV T26.Y, 0.0, -; CM-NEXT: LSHR * T20.Z, T22.Y, literal.y, +; CM-NEXT: LSHR * T21.Z, T23.Y, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T20.X, T22.Y, literal.x, -; CM-NEXT: MOV T20.Y, 0.0, -; CM-NEXT: LSHR * T27.Z, T22.X, literal.y, +; CM-NEXT: AND_INT T21.X, T23.Y, literal.x, +; CM-NEXT: MOV T21.Y, 0.0, +; CM-NEXT: LSHR * T27.Z, T23.X, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T27.X, T22.X, literal.x, +; CM-NEXT: AND_INT T27.X, T23.X, literal.x, ; CM-NEXT: MOV T27.Y, 0.0, -; CM-NEXT: LSHR * T28.Z, T22.W, literal.y, +; CM-NEXT: LSHR * T28.Z, T23.W, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T28.X, T22.W, literal.x, +; CM-NEXT: AND_INT T28.X, T23.W, literal.x, ; CM-NEXT: MOV T28.Y, 0.0, -; CM-NEXT: LSHR * T29.Z, T22.Z, literal.y, +; CM-NEXT: LSHR * T29.Z, T23.Z, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T29.X, T22.Z, literal.x, +; CM-NEXT: AND_INT T29.X, T23.Z, literal.x, ; CM-NEXT: MOV T29.Y, 0.0, -; CM-NEXT: LSHR * T19.Z, T21.Y, literal.y, +; CM-NEXT: LSHR * T20.Z, T22.Y, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) ; CM-NEXT: ALU clause starting at 65: -; CM-NEXT: AND_INT T19.X, T21.Y, literal.x, -; CM-NEXT: MOV T19.Y, 0.0, -; CM-NEXT: LSHR * T30.Z, T21.X, literal.y, +; CM-NEXT: AND_INT T20.X, T22.Y, literal.x, +; CM-NEXT: MOV T20.Y, 0.0, +; CM-NEXT: LSHR * T30.Z, T22.X, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T30.X, T21.X, literal.x, +; CM-NEXT: AND_INT T30.X, T22.X, literal.x, ; CM-NEXT: MOV T30.Y, 0.0, -; CM-NEXT: LSHR * T31.Z, T21.W, literal.y, +; CM-NEXT: LSHR * T31.Z, T22.W, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T31.X, T21.W, literal.x, +; CM-NEXT: AND_INT T31.X, T22.W, literal.x, ; CM-NEXT: MOV T31.Y, 0.0, -; CM-NEXT: LSHR * T32.Z, T21.Z, literal.y, +; CM-NEXT: LSHR * T32.Z, T22.Z, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T32.X, T21.Z, literal.x, +; CM-NEXT: AND_INT T32.X, T22.Z, literal.x, ; CM-NEXT: MOV T32.Y, 0.0, -; CM-NEXT: LSHR * T21.Z, T22.Y, literal.y, +; CM-NEXT: LSHR * T22.Z, T23.Y, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T21.X, T22.Y, literal.x, -; CM-NEXT: MOV T21.Y, 0.0, -; CM-NEXT: LSHR * T33.Z, T22.X, literal.y, +; CM-NEXT: AND_INT T22.X, T23.Y, literal.x, +; CM-NEXT: MOV T22.Y, 0.0, +; CM-NEXT: LSHR * T33.Z, T23.X, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T33.X, T22.X, literal.x, +; CM-NEXT: AND_INT T33.X, T23.X, literal.x, ; CM-NEXT: MOV T33.Y, 0.0, -; CM-NEXT: LSHR * T34.Z, T22.W, literal.y, +; CM-NEXT: LSHR * T34.Z, T23.W, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T34.X, T22.W, literal.x, +; CM-NEXT: AND_INT T34.X, T23.W, literal.x, ; CM-NEXT: MOV T34.Y, 0.0, -; CM-NEXT: LSHR * T35.Z, T22.Z, literal.y, +; CM-NEXT: LSHR * T35.Z, T23.Z, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T35.X, T22.Z, literal.x, +; CM-NEXT: AND_INT T35.X, T23.Z, literal.x, ; CM-NEXT: MOV T35.Y, 0.0, -; CM-NEXT: MOV * T23.W, 0.0, +; CM-NEXT: MOV * T19.W, 0.0, ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; CM-NEXT: MOV * T24.W, 0.0, ; CM-NEXT: MOV * T25.W, 0.0, ; CM-NEXT: MOV * T26.W, 0.0, -; CM-NEXT: MOV * T20.W, 0.0, +; CM-NEXT: MOV * T21.W, 0.0, ; CM-NEXT: MOV * T27.W, 0.0, ; CM-NEXT: MOV * T28.W, 0.0, ; CM-NEXT: MOV * T29.W, 0.0, -; CM-NEXT: MOV * T19.W, 0.0, +; CM-NEXT: MOV * T20.W, 0.0, ; CM-NEXT: MOV * T30.W, 0.0, ; CM-NEXT: MOV * T31.W, 0.0, ; CM-NEXT: MOV * T32.W, 0.0, -; CM-NEXT: MOV * T21.W, 0.0, +; CM-NEXT: MOV * T22.W, 0.0, ; CM-NEXT: MOV * T33.W, 0.0, ; CM-NEXT: MOV * T34.W, 0.0, ; CM-NEXT: MOV * T35.W, 0.0, ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; CM-NEXT: 224(3.138909e-43), 0(0.000000e+00) -; CM-NEXT: LSHR T22.X, PV.W, literal.x, +; CM-NEXT: LSHR T23.X, PV.W, literal.x, ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; CM-NEXT: 2(2.802597e-45), 240(3.363116e-43) ; CM-NEXT: LSHR T36.X, PV.W, literal.x, @@ -7957,45 +7963,45 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v15 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[2:3], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v18, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[20:21], v[2:3], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[0:1], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[20:21], v[0:1], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v21, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[6:7], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v22, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[20:21], v[6:7], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[4:5], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[20:21], v[4:5], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v22, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[10:11], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v23, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[20:21], v[10:11], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[8:9], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[20:21], v[8:9], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v17, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v23, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[14:15], 48 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:48 @@ -8020,7 +8026,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v8, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v10, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v21, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v22, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v24, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v4, 0, 16 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 @@ -8058,7 +8064,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 @@ -8066,9 +8072,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 @@ -8091,8 +8097,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[4:5], 48 -; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[8:9], 48 +; GCN-HSA-NEXT: v_bfe_i32 v16, v9, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 @@ -8100,73 +8106,73 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, v11 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[6:7], 48 +; GCN-HSA-NEXT: v_bfe_i32 v16, v9, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[10:11], 48 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x70 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v10 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] ; GCN-HSA-NEXT: s_add_u32 s8, s0, 0x50 -; GCN-HSA-NEXT: v_bfe_i32 v18, v5, 0, 16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v18, v9, 0, 16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GCN-HSA-NEXT: v_bfe_i32 v16, v10, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v10, v9, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GCN-HSA-NEXT: s_add_u32 s10, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[8:11] ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[0:1], 48 -; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[0:1], 48 +; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s11 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s10 -; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[2:3], 48 +; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[2:3], 48 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[8:9], 48 -; GCN-HSA-NEXT: v_bfe_i32 v3, v9, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[4:5], 48 +; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s4 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[3:6] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v7 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v11 -; GCN-HSA-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[10:11], 48 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-HSA-NEXT: v_bfe_i32 v8, v3, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[6:7], 48 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s9 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[3:6] -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[12:13], 48 -; GCN-HSA-NEXT: v_bfe_i32 v3, v13, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[3:6] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s7 +; GCN-HSA-NEXT: v_bfe_i32 v7, v13, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[9:10], v[12:13], 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v15 -; GCN-HSA-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[14:15], 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[7:10] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s6 +; GCN-HSA-NEXT: v_bfe_i32 v7, v3, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[9:10], v[14:15], 48 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[3:6] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[7:10] ; GCN-HSA-NEXT: v_bfe_i32 v19, v0, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v23, v2, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v21, v1, 0, 16 @@ -8180,50 +8186,50 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[23:26] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v21 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_bfe_i32 v15, v10, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v15, v6, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v17, v18, 0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v12 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_bfe_i32 v13, v7, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v7, v8, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v9, v9, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v9, v3, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v3, v4, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v14 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_bfe_i32 v11, v14, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; GCN-HSA-NEXT: v_bfe_i32 v3, v12, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v7, v12, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 969cf2c457b98..b4e9376d82777 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -1556,24 +1556,24 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; EG-NEXT: Fetch clause starting at 12: ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 14: -; EG-NEXT: MOV T1.W, literal.x, -; EG-NEXT: SETNE_INT * T0.W, KC0[2].W, 0.0, +; EG-NEXT: MOV T0.W, literal.x, +; EG-NEXT: SETNE_INT * T1.W, KC0[2].W, 0.0, ; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) ; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, ; EG-NEXT: ALU clause starting at 18: -; EG-NEXT: MOV T0.W, KC0[2].W, +; EG-NEXT: MOV T1.W, KC0[2].W, ; EG-NEXT: MOV * T2.W, KC0[3].X, -; EG-NEXT: MOV T1.W, literal.x, +; EG-NEXT: MOV T0.W, literal.x, ; EG-NEXT: MULLO_INT * T0.X, PV.W, PS, ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 23: -; EG-NEXT: MOV T0.W, KC0[2].Y, -; EG-NEXT: SETE_INT * T1.W, T1.W, 0.0, +; EG-NEXT: MOV T1.W, KC0[2].Y, +; EG-NEXT: SETE_INT * T0.W, T0.W, 0.0, ; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, ; EG-NEXT: ALU clause starting at 26: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 27: -; EG-NEXT: LSHR * T1.X, T0.W, literal.x, +; EG-NEXT: LSHR * T1.X, T1.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = icmp eq i32 %a, 0 @@ -1923,52 +1923,52 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; ; GFX9-LABEL: s_mul_i128: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s15, 0xf000 -; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x4c +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x7c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s8, s7 -; GFX9-NEXT: s_mul_hi_u32 s1, s8, s6 -; GFX9-NEXT: s_mul_i32 s2, s10, s5 -; GFX9-NEXT: s_mul_hi_u32 s3, s10, s4 +; GFX9-NEXT: s_mul_i32 s0, s12, s11 +; GFX9-NEXT: s_mul_hi_u32 s1, s12, s10 +; GFX9-NEXT: s_mul_i32 s2, s14, s9 +; GFX9-NEXT: s_mul_hi_u32 s3, s14, s8 ; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s9, s6 +; GFX9-NEXT: s_mul_i32 s1, s13, s10 ; GFX9-NEXT: s_add_i32 s2, s3, s2 -; GFX9-NEXT: s_mul_i32 s3, s11, s4 +; GFX9-NEXT: s_mul_i32 s3, s15, s8 ; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s8, s6 +; GFX9-NEXT: s_mul_i32 s1, s12, s10 ; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_mul_i32 s3, s10, s4 +; GFX9-NEXT: s_mul_i32 s3, s14, s8 ; GFX9-NEXT: s_add_u32 s3, s3, s1 ; GFX9-NEXT: s_addc_u32 s2, s2, s0 -; GFX9-NEXT: s_mul_i32 s10, s5, s8 -; GFX9-NEXT: s_mul_hi_u32 s11, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s7, s5, s8 -; GFX9-NEXT: s_add_u32 s10, s10, s11 -; GFX9-NEXT: s_mul_i32 s1, s4, s9 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_mul_hi_u32 s6, s4, s9 -; GFX9-NEXT: s_add_u32 s1, s1, s10 -; GFX9-NEXT: s_addc_u32 s6, s6, 0 -; GFX9-NEXT: s_add_u32 s6, s7, s6 -; GFX9-NEXT: s_addc_u32 s7, 0, 0 -; GFX9-NEXT: s_mul_hi_u32 s10, s5, s9 -; GFX9-NEXT: s_mul_i32 s5, s5, s9 -; GFX9-NEXT: s_add_u32 s5, s5, s6 -; GFX9-NEXT: s_addc_u32 s6, s10, s7 +; GFX9-NEXT: s_mul_i32 s14, s9, s12 +; GFX9-NEXT: s_mul_hi_u32 s15, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s11, s9, s12 +; GFX9-NEXT: s_add_u32 s14, s14, s15 +; GFX9-NEXT: s_mul_i32 s1, s8, s13 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_mul_hi_u32 s10, s8, s13 +; GFX9-NEXT: s_add_u32 s1, s1, s14 +; GFX9-NEXT: s_addc_u32 s10, s10, 0 +; GFX9-NEXT: s_add_u32 s10, s11, s10 +; GFX9-NEXT: s_addc_u32 s11, 0, 0 +; GFX9-NEXT: s_mul_hi_u32 s14, s9, s13 +; GFX9-NEXT: s_mul_i32 s9, s9, s13 +; GFX9-NEXT: s_add_u32 s9, s9, s10 +; GFX9-NEXT: s_addc_u32 s10, s14, s11 ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_add_u32 s5, s5, s3 -; GFX9-NEXT: s_addc_u32 s6, s6, s2 -; GFX9-NEXT: s_mul_i32 s2, s4, s8 +; GFX9-NEXT: s_add_u32 s9, s9, s3 +; GFX9-NEXT: s_addc_u32 s10, s10, s2 +; GFX9-NEXT: s_mul_i32 s2, s8, s12 ; GFX9-NEXT: s_mov_b32 s3, s0 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i128: diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir index ce1a9ad58f011..47d06fa30a01e 100644 --- a/llvm/test/CodeGen/AMDGPU/pr51516.mir +++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir @@ -4,8 +4,8 @@ # is killed by that store. # GCN-LABEL: name: global_sextload_v32i32_to_v32i64 -# GCN: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) -# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr20, killed renamable $vgpr27_vgpr28_vgpr29_vgpr30, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr0 +# GCN: renamable $vgpr33_vgpr34_vgpr35_vgpr36 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) +# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr47, killed renamable $vgpr29_vgpr30_vgpr31_vgpr32, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr46 --- name: global_sextload_v32i32_to_v32i64 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 9e6efc79d44f0..a462c19ce645d 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -821,12 +821,12 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: .LBB1_2: ; %for.body ; GFX90A-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v6 -; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v7, vcc -; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-4096 -; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[6:7], off offset:-2048 +; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc +; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 +; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 ; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6 -; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[8:9], off +; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off ; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc ; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 ; GFX90A-NEXT: v_add_co_u32_e32 v16, vcc, s0, v6 @@ -847,8 +847,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: s_addk_i32 s4, 0x2000 ; GFX90A-NEXT: s_cmp_gt_u32 s4, 0x3fffff ; GFX90A-NEXT: s_waitcnt vmcnt(8) -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v8, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v12, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(7) ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v18, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v19, v4, vcc @@ -869,10 +869,10 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v20, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v21, v4, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v8, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v10, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v4, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v12, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v4, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll index 8526efa18767d..415ed89668abb 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll @@ -91,20 +91,20 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; SI-IEEE-SAFE-LABEL: rsq_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 ; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 @@ -129,24 +129,24 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-IEEE-SAFE-NEXT: s_endpgm ; ; CI-IEEE-SAFE-LABEL: rsq_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 ; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 @@ -169,7 +169,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-IEEE-SAFE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: rsq_f32: ; GCN-UNSAFE: ; %bb.0: @@ -605,20 +605,20 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; SI-IEEE-SAFE-LABEL: neg_rsq_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 ; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 @@ -643,24 +643,24 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-IEEE-SAFE-NEXT: s_endpgm ; ; CI-IEEE-SAFE-LABEL: neg_rsq_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 ; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 @@ -683,7 +683,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-IEEE-SAFE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: neg_rsq_f32: ; GCN-UNSAFE: ; %bb.0: @@ -786,20 +786,20 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; SI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 ; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x8f800000 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 @@ -824,24 +824,24 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-IEEE-SAFE-NEXT: s_endpgm ; ; CI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 ; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x8f800000 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 @@ -864,7 +864,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-IEEE-SAFE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: neg_rsq_neg_f32: ; GCN-UNSAFE: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll index 09cd01af5bccb..d19a9233e118b 100644 --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -63,101 +63,101 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_writelane_b32 v23, s18, 28 ; CHECK-NEXT: v_writelane_b32 v23, s19, 29 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[2:3] +; CHECK-NEXT: ; def s[42:43] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s2, 30 -; CHECK-NEXT: v_writelane_b32 v23, s3, 31 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:7] +; CHECK-NEXT: ; def s[52:55] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 32 -; CHECK-NEXT: v_writelane_b32 v23, s5, 33 -; CHECK-NEXT: v_writelane_b32 v23, s6, 34 -; CHECK-NEXT: v_writelane_b32 v23, s7, 35 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:11] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 36 -; CHECK-NEXT: v_writelane_b32 v23, s5, 37 -; CHECK-NEXT: v_writelane_b32 v23, s6, 38 -; CHECK-NEXT: v_writelane_b32 v23, s7, 39 -; CHECK-NEXT: v_writelane_b32 v23, s8, 40 -; CHECK-NEXT: v_writelane_b32 v23, s9, 41 -; CHECK-NEXT: v_writelane_b32 v23, s10, 42 -; CHECK-NEXT: v_writelane_b32 v23, s11, 43 +; CHECK-NEXT: v_writelane_b32 v23, s4, 30 +; CHECK-NEXT: v_writelane_b32 v23, s5, 31 +; CHECK-NEXT: v_writelane_b32 v23, s6, 32 +; CHECK-NEXT: v_writelane_b32 v23, s7, 33 +; CHECK-NEXT: v_writelane_b32 v23, s8, 34 +; CHECK-NEXT: v_writelane_b32 v23, s9, 35 +; CHECK-NEXT: v_writelane_b32 v23, s10, 36 +; CHECK-NEXT: v_writelane_b32 v23, s11, 37 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[16:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[52:53] +; CHECK-NEXT: ; def s[40:41] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[48:51] +; CHECK-NEXT: ; def s[36:39] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[36:43] +; CHECK-NEXT: ; def s[44:51] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 44 -; CHECK-NEXT: v_writelane_b32 v23, s1, 45 -; CHECK-NEXT: v_writelane_b32 v23, s2, 46 -; CHECK-NEXT: v_writelane_b32 v23, s3, 47 -; CHECK-NEXT: v_writelane_b32 v23, s4, 48 -; CHECK-NEXT: v_writelane_b32 v23, s5, 49 -; CHECK-NEXT: v_writelane_b32 v23, s6, 50 -; CHECK-NEXT: v_writelane_b32 v23, s7, 51 -; CHECK-NEXT: v_writelane_b32 v23, s8, 52 -; CHECK-NEXT: v_writelane_b32 v23, s9, 53 -; CHECK-NEXT: v_writelane_b32 v23, s10, 54 -; CHECK-NEXT: v_writelane_b32 v23, s11, 55 -; CHECK-NEXT: v_writelane_b32 v23, s12, 56 -; CHECK-NEXT: v_writelane_b32 v23, s13, 57 -; CHECK-NEXT: v_writelane_b32 v23, s14, 58 -; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: v_writelane_b32 v23, s15, 59 +; CHECK-NEXT: v_writelane_b32 v23, s0, 38 +; CHECK-NEXT: v_writelane_b32 v23, s1, 39 +; CHECK-NEXT: v_writelane_b32 v23, s2, 40 +; CHECK-NEXT: v_writelane_b32 v23, s3, 41 +; CHECK-NEXT: v_writelane_b32 v23, s4, 42 +; CHECK-NEXT: v_writelane_b32 v23, s5, 43 +; CHECK-NEXT: v_writelane_b32 v23, s6, 44 +; CHECK-NEXT: v_writelane_b32 v23, s7, 45 +; CHECK-NEXT: v_writelane_b32 v23, s8, 46 +; CHECK-NEXT: v_writelane_b32 v23, s9, 47 +; CHECK-NEXT: v_writelane_b32 v23, s10, 48 +; CHECK-NEXT: v_writelane_b32 v23, s11, 49 +; CHECK-NEXT: v_writelane_b32 v23, s12, 50 +; CHECK-NEXT: v_writelane_b32 v23, s13, 51 +; CHECK-NEXT: v_writelane_b32 v23, s14, 52 +; CHECK-NEXT: v_writelane_b32 v23, s15, 53 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[44:47] +; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s0, 54 +; CHECK-NEXT: v_writelane_b32 v23, s1, 55 +; CHECK-NEXT: v_writelane_b32 v23, s2, 56 +; CHECK-NEXT: v_writelane_b32 v23, s3, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 60 -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 -; CHECK-NEXT: v_writelane_b32 v23, s1, 61 -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 -; CHECK-NEXT: v_writelane_b32 v23, s2, 62 -; CHECK-NEXT: v_writelane_b32 v0, s6, 2 -; CHECK-NEXT: v_writelane_b32 v23, s3, 63 -; CHECK-NEXT: v_writelane_b32 v0, s7, 3 +; CHECK-NEXT: v_writelane_b32 v23, s0, 58 +; CHECK-NEXT: v_writelane_b32 v23, s1, 59 +; CHECK-NEXT: v_writelane_b32 v23, s2, 60 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_writelane_b32 v23, s3, 61 +; CHECK-NEXT: v_writelane_b32 v23, s4, 62 +; CHECK-NEXT: v_writelane_b32 v0, s6, 0 +; CHECK-NEXT: v_writelane_b32 v23, s5, 63 +; CHECK-NEXT: v_writelane_b32 v0, s7, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 4 -; CHECK-NEXT: v_writelane_b32 v0, s1, 5 -; CHECK-NEXT: v_writelane_b32 v0, s2, 6 -; CHECK-NEXT: v_writelane_b32 v0, s3, 7 -; CHECK-NEXT: v_writelane_b32 v0, s4, 8 -; CHECK-NEXT: v_writelane_b32 v0, s5, 9 -; CHECK-NEXT: v_writelane_b32 v0, s6, 10 -; CHECK-NEXT: v_writelane_b32 v0, s7, 11 -; CHECK-NEXT: v_writelane_b32 v0, s8, 12 -; CHECK-NEXT: v_writelane_b32 v0, s9, 13 -; CHECK-NEXT: v_writelane_b32 v0, s10, 14 -; CHECK-NEXT: v_writelane_b32 v0, s11, 15 -; CHECK-NEXT: v_writelane_b32 v0, s12, 16 -; CHECK-NEXT: v_writelane_b32 v0, s13, 17 -; CHECK-NEXT: v_writelane_b32 v0, s14, 18 -; CHECK-NEXT: v_writelane_b32 v0, s15, 19 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[54:55] -; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s0, 2 +; CHECK-NEXT: v_writelane_b32 v0, s1, 3 +; CHECK-NEXT: v_writelane_b32 v0, s2, 4 +; CHECK-NEXT: v_writelane_b32 v0, s3, 5 +; CHECK-NEXT: v_writelane_b32 v0, s4, 6 +; CHECK-NEXT: v_writelane_b32 v0, s5, 7 +; CHECK-NEXT: v_writelane_b32 v0, s6, 8 +; CHECK-NEXT: v_writelane_b32 v0, s7, 9 +; CHECK-NEXT: v_writelane_b32 v0, s8, 10 +; CHECK-NEXT: v_writelane_b32 v0, s9, 11 +; CHECK-NEXT: v_writelane_b32 v0, s10, 12 +; CHECK-NEXT: v_writelane_b32 v0, s11, 13 +; CHECK-NEXT: v_writelane_b32 v0, s12, 14 +; CHECK-NEXT: v_writelane_b32 v0, s13, 15 +; CHECK-NEXT: v_writelane_b32 v0, s14, 16 +; CHECK-NEXT: v_writelane_b32 v0, s15, 17 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s0, 18 +; CHECK-NEXT: v_writelane_b32 v0, s1, 19 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND @@ -245,102 +245,102 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v23, 30 ; CHECK-NEXT: v_readlane_b32 s1, v23, 31 +; CHECK-NEXT: v_readlane_b32 s2, v23, 32 +; CHECK-NEXT: v_readlane_b32 s3, v23, 33 +; CHECK-NEXT: v_readlane_b32 s4, v23, 34 +; CHECK-NEXT: v_readlane_b32 s5, v23, 35 +; CHECK-NEXT: v_readlane_b32 s6, v23, 36 +; CHECK-NEXT: v_readlane_b32 s7, v23, 37 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ; use s[42:43] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 32 -; CHECK-NEXT: v_readlane_b32 s1, v23, 33 -; CHECK-NEXT: v_readlane_b32 s2, v23, 34 -; CHECK-NEXT: v_readlane_b32 s3, v23, 35 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:3] +; CHECK-NEXT: ; use s[52:55] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 36 -; CHECK-NEXT: v_readlane_b32 s1, v23, 37 -; CHECK-NEXT: v_readlane_b32 s2, v23, 38 -; CHECK-NEXT: v_readlane_b32 s3, v23, 39 -; CHECK-NEXT: v_readlane_b32 s4, v23, 40 -; CHECK-NEXT: v_readlane_b32 s5, v23, 41 -; CHECK-NEXT: v_readlane_b32 s6, v23, 42 -; CHECK-NEXT: v_readlane_b32 s7, v23, 43 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 44 -; CHECK-NEXT: v_readlane_b32 s1, v23, 45 -; CHECK-NEXT: v_readlane_b32 s2, v23, 46 -; CHECK-NEXT: v_readlane_b32 s3, v23, 47 -; CHECK-NEXT: v_readlane_b32 s4, v23, 48 -; CHECK-NEXT: v_readlane_b32 s5, v23, 49 -; CHECK-NEXT: v_readlane_b32 s6, v23, 50 -; CHECK-NEXT: v_readlane_b32 s7, v23, 51 +; CHECK-NEXT: v_readlane_b32 s0, v23, 38 +; CHECK-NEXT: v_readlane_b32 s1, v23, 39 +; CHECK-NEXT: v_readlane_b32 s2, v23, 40 +; CHECK-NEXT: v_readlane_b32 s3, v23, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[16:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[52:53] +; CHECK-NEXT: ; use s[40:41] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[48:51] +; CHECK-NEXT: ; use s[36:39] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[36:43] +; CHECK-NEXT: ; use s[44:51] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s8, v23, 52 -; CHECK-NEXT: v_readlane_b32 s9, v23, 53 -; CHECK-NEXT: v_readlane_b32 s10, v23, 54 -; CHECK-NEXT: v_readlane_b32 s11, v23, 55 -; CHECK-NEXT: v_readlane_b32 s12, v23, 56 -; CHECK-NEXT: v_readlane_b32 s13, v23, 57 -; CHECK-NEXT: v_readlane_b32 s14, v23, 58 -; CHECK-NEXT: v_readlane_b32 s15, v23, 59 +; CHECK-NEXT: v_readlane_b32 s4, v23, 42 +; CHECK-NEXT: v_readlane_b32 s5, v23, 43 +; CHECK-NEXT: v_readlane_b32 s6, v23, 44 +; CHECK-NEXT: v_readlane_b32 s7, v23, 45 +; CHECK-NEXT: v_readlane_b32 s8, v23, 46 +; CHECK-NEXT: v_readlane_b32 s9, v23, 47 +; CHECK-NEXT: v_readlane_b32 s10, v23, 48 +; CHECK-NEXT: v_readlane_b32 s11, v23, 49 +; CHECK-NEXT: v_readlane_b32 s12, v23, 50 +; CHECK-NEXT: v_readlane_b32 s13, v23, 51 +; CHECK-NEXT: v_readlane_b32 s14, v23, 52 +; CHECK-NEXT: v_readlane_b32 s15, v23, 53 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 60 -; CHECK-NEXT: v_readlane_b32 s1, v23, 61 -; CHECK-NEXT: v_readlane_b32 s2, v23, 62 -; CHECK-NEXT: v_readlane_b32 s3, v23, 63 -; CHECK-NEXT: v_readlane_b32 s4, v0, 0 -; CHECK-NEXT: v_readlane_b32 s5, v0, 1 -; CHECK-NEXT: v_readlane_b32 s6, v0, 2 -; CHECK-NEXT: v_readlane_b32 s7, v0, 3 +; CHECK-NEXT: v_readlane_b32 s0, v23, 54 +; CHECK-NEXT: v_readlane_b32 s1, v23, 55 +; CHECK-NEXT: v_readlane_b32 s2, v23, 56 +; CHECK-NEXT: v_readlane_b32 s3, v23, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[44:47] +; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v23, 58 +; CHECK-NEXT: v_readlane_b32 s1, v23, 59 +; CHECK-NEXT: v_readlane_b32 s2, v23, 60 +; CHECK-NEXT: v_readlane_b32 s3, v23, 61 +; CHECK-NEXT: v_readlane_b32 s4, v23, 62 +; CHECK-NEXT: v_readlane_b32 s5, v23, 63 +; CHECK-NEXT: v_readlane_b32 s6, v0, 0 +; CHECK-NEXT: v_readlane_b32 s7, v0, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 4 -; CHECK-NEXT: v_readlane_b32 s1, v0, 5 -; CHECK-NEXT: v_readlane_b32 s2, v0, 6 -; CHECK-NEXT: v_readlane_b32 s3, v0, 7 -; CHECK-NEXT: v_readlane_b32 s4, v0, 8 -; CHECK-NEXT: v_readlane_b32 s5, v0, 9 -; CHECK-NEXT: v_readlane_b32 s6, v0, 10 -; CHECK-NEXT: v_readlane_b32 s7, v0, 11 -; CHECK-NEXT: v_readlane_b32 s8, v0, 12 -; CHECK-NEXT: v_readlane_b32 s9, v0, 13 -; CHECK-NEXT: v_readlane_b32 s10, v0, 14 -; CHECK-NEXT: v_readlane_b32 s11, v0, 15 -; CHECK-NEXT: v_readlane_b32 s12, v0, 16 -; CHECK-NEXT: v_readlane_b32 s13, v0, 17 -; CHECK-NEXT: v_readlane_b32 s14, v0, 18 -; CHECK-NEXT: v_readlane_b32 s15, v0, 19 +; CHECK-NEXT: v_readlane_b32 s0, v0, 2 +; CHECK-NEXT: v_readlane_b32 s1, v0, 3 +; CHECK-NEXT: v_readlane_b32 s2, v0, 4 +; CHECK-NEXT: v_readlane_b32 s3, v0, 5 +; CHECK-NEXT: v_readlane_b32 s4, v0, 6 +; CHECK-NEXT: v_readlane_b32 s5, v0, 7 +; CHECK-NEXT: v_readlane_b32 s6, v0, 8 +; CHECK-NEXT: v_readlane_b32 s7, v0, 9 +; CHECK-NEXT: v_readlane_b32 s8, v0, 10 +; CHECK-NEXT: v_readlane_b32 s9, v0, 11 +; CHECK-NEXT: v_readlane_b32 s10, v0, 12 +; CHECK-NEXT: v_readlane_b32 s11, v0, 13 +; CHECK-NEXT: v_readlane_b32 s12, v0, 14 +; CHECK-NEXT: v_readlane_b32 s13, v0, 15 +; CHECK-NEXT: v_readlane_b32 s14, v0, 16 +; CHECK-NEXT: v_readlane_b32 s15, v0, 17 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 18 +; CHECK-NEXT: v_readlane_b32 s1, v0, 19 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v0, 20 ; CHECK-NEXT: v_readlane_b32 s1, v0, 21 ; CHECK-NEXT: v_readlane_b32 s2, v0, 22 ; CHECK-NEXT: v_readlane_b32 s3, v0, 23 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[54:55] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v0, 24 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index 974cb71900a4d..d59660751cc18 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -16,17 +16,17 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s10, s6 -; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s2 -; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 @@ -56,22 +56,22 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: sdiv_i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s7, 0xf000 -; TONGA-NEXT: s_mov_b32 s6, -1 -; TONGA-NEXT: s_mov_b32 s10, s6 -; TONGA-NEXT: s_mov_b32 s11, s7 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s3, 0xf000 +; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s2 -; TONGA-NEXT: s_mov_b32 s9, s3 +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 ; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s4, s0 -; TONGA-NEXT: s_mov_b32 s5, s1 +; TONGA-NEXT: s_mov_b32 s0, s4 +; TONGA-NEXT: s_mov_b32 s1, s5 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v2 @@ -101,56 +101,56 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v2 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 -; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: s_ashr_i32 s3, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: s_ashr_i32 s1, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: s_ashr_i32 s6, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s6 +; GFX9-NEXT: s_xor_b32 s7, s0, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: s_ashr_i32 s5, s4, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_xor_b32 s3, s1, s3 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_sub_i32 s1, 0, s2 +; GFX9-NEXT: s_add_i32 s4, s4, s5 +; GFX9-NEXT: s_xor_b32 s6, s5, s6 +; GFX9-NEXT: s_xor_b32 s4, s4, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s5, 0, s7 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s1, s1, s8 -; GFX9-NEXT: s_mul_hi_u32 s1, s8, s1 -; GFX9-NEXT: s_add_i32 s8, s8, s1 -; GFX9-NEXT: s_mul_hi_u32 s1, s0, s8 -; GFX9-NEXT: s_mul_i32 s8, s1, s2 -; GFX9-NEXT: s_sub_i32 s0, s0, s8 -; GFX9-NEXT: s_add_i32 s9, s1, 1 -; GFX9-NEXT: s_sub_i32 s8, s0, s2 -; GFX9-NEXT: s_cmp_ge_u32 s0, s2 -; GFX9-NEXT: s_cselect_b32 s1, s9, s1 -; GFX9-NEXT: s_cselect_b32 s0, s8, s0 -; GFX9-NEXT: s_add_i32 s8, s1, 1 -; GFX9-NEXT: s_cmp_ge_u32 s0, s2 -; GFX9-NEXT: s_cselect_b32 s0, s8, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s3 -; GFX9-NEXT: s_sub_i32 s0, s0, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mul_i32 s5, s5, s8 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s5 +; GFX9-NEXT: s_add_i32 s8, s8, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8 +; GFX9-NEXT: s_mul_i32 s8, s5, s7 +; GFX9-NEXT: s_sub_i32 s4, s4, s8 +; GFX9-NEXT: s_add_i32 s9, s5, 1 +; GFX9-NEXT: s_sub_i32 s8, s4, s7 +; GFX9-NEXT: s_cmp_ge_u32 s4, s7 +; GFX9-NEXT: s_cselect_b32 s5, s9, s5 +; GFX9-NEXT: s_cselect_b32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s5, 1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s7 +; GFX9-NEXT: s_cselect_b32 s4, s8, s5 +; GFX9-NEXT: s_xor_b32 s4, s4, s6 +; GFX9-NEXT: s_sub_i32 s4, s4, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: sdiv_i32: @@ -1373,17 +1373,17 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: sdiv_v4i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s7, 0xf000 -; TONGA-NEXT: s_mov_b32 s6, -1 -; TONGA-NEXT: s_mov_b32 s10, s6 -; TONGA-NEXT: s_mov_b32 s11, s7 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s3, 0xf000 +; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s2 -; TONGA-NEXT: s_mov_b32 s9, s3 +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s4, s0 -; TONGA-NEXT: s_mov_b32 s5, s1 +; TONGA-NEXT: s_mov_b32 s0, s4 +; TONGA-NEXT: s_mov_b32 s1, s5 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v1 @@ -1401,7 +1401,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 2, v2 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 2, v3 -; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v4i32_4: @@ -1672,20 +1672,20 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: v_sdiv_i23: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s7, 0xf000 -; TONGA-NEXT: s_mov_b32 s6, -1 -; TONGA-NEXT: s_mov_b32 s10, s6 -; TONGA-NEXT: s_mov_b32 s11, s7 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s3, 0xf000 +; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s2 -; TONGA-NEXT: s_mov_b32 s9, s3 +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 ; TONGA-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2 ; TONGA-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6 ; TONGA-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 ; TONGA-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s4, s0 -; TONGA-NEXT: s_mov_b32 s5, s1 +; TONGA-NEXT: s_mov_b32 s0, s4 +; TONGA-NEXT: s_mov_b32 s1, s5 ; TONGA-NEXT: s_waitcnt vmcnt(3) ; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; TONGA-NEXT: s_waitcnt vmcnt(2) @@ -1710,25 +1710,25 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23 -; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: v_sdiv_i23: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2 ; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6 ; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 ; GFX9-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -1753,7 +1753,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v_sdiv_i23: @@ -1859,20 +1859,20 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: v_sdiv_i24: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s7, 0xf000 -; TONGA-NEXT: s_mov_b32 s6, -1 -; TONGA-NEXT: s_mov_b32 s10, s6 -; TONGA-NEXT: s_mov_b32 s11, s7 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s3, 0xf000 +; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s2 -; TONGA-NEXT: s_mov_b32 s9, s3 +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 ; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:6 ; TONGA-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 ; TONGA-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 offset:2 ; TONGA-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s4, s0 -; TONGA-NEXT: s_mov_b32 s5, s1 +; TONGA-NEXT: s_mov_b32 s0, s4 +; TONGA-NEXT: s_mov_b32 s1, s5 ; TONGA-NEXT: s_waitcnt vmcnt(3) ; TONGA-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; TONGA-NEXT: s_waitcnt vmcnt(2) @@ -1895,25 +1895,25 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 24 -; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: v_sdiv_i24: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:6 ; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 ; GFX9-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 offset:2 ; GFX9-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -1936,7 +1936,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v_sdiv_i24: @@ -1997,17 +1997,17 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i25: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s10, s6 -; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s2 -; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_bfe_i32 v2, v1, 0, 25 ; GCN-NEXT: v_bfe_i32 v1, v1, 24, 1 @@ -2040,22 +2040,22 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: v_sdiv_i25: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s7, 0xf000 -; TONGA-NEXT: s_mov_b32 s6, -1 -; TONGA-NEXT: s_mov_b32 s10, s6 -; TONGA-NEXT: s_mov_b32 s11, s7 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s3, 0xf000 +; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s2 -; TONGA-NEXT: s_mov_b32 s9, s3 +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 ; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s4, s0 -; TONGA-NEXT: s_mov_b32 s5, s1 +; TONGA-NEXT: s_mov_b32 s0, s4 +; TONGA-NEXT: s_mov_b32 s1, s5 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_bfe_i32 v2, v1, 0, 25 ; TONGA-NEXT: v_bfe_i32 v1, v1, 24, 1 @@ -2088,59 +2088,59 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25 -; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: v_sdiv_i25: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: s_bfe_i32 s3, s2, 0x190000 -; GFX9-NEXT: s_bfe_i32 s2, s2, 0x10018 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x190000 -; GFX9-NEXT: s_bfe_i32 s0, s0, 0x10018 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x10018 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v1 -; GFX9-NEXT: s_add_i32 s1, s1, s0 -; GFX9-NEXT: s_xor_b32 s2, s0, s2 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_bfe_i32 s5, s4, 0x190000 +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x10018 +; GFX9-NEXT: s_add_i32 s5, s5, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s1, 0, s3 +; GFX9-NEXT: s_xor_b32 s6, s4, s6 +; GFX9-NEXT: s_xor_b32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s5, 0, s7 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s1, s1, s8 -; GFX9-NEXT: s_mul_hi_u32 s1, s8, s1 -; GFX9-NEXT: s_add_i32 s8, s8, s1 -; GFX9-NEXT: s_mul_hi_u32 s1, s0, s8 -; GFX9-NEXT: s_mul_i32 s8, s1, s3 -; GFX9-NEXT: s_sub_i32 s0, s0, s8 -; GFX9-NEXT: s_add_i32 s9, s1, 1 -; GFX9-NEXT: s_sub_i32 s8, s0, s3 -; GFX9-NEXT: s_cmp_ge_u32 s0, s3 -; GFX9-NEXT: s_cselect_b32 s1, s9, s1 -; GFX9-NEXT: s_cselect_b32 s0, s8, s0 -; GFX9-NEXT: s_add_i32 s8, s1, 1 -; GFX9-NEXT: s_cmp_ge_u32 s0, s3 -; GFX9-NEXT: s_cselect_b32 s0, s8, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s2 -; GFX9-NEXT: s_sub_i32 s0, s0, s2 -; GFX9-NEXT: s_bfe_i32 s0, s0, 0x190000 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mul_i32 s5, s5, s8 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s5 +; GFX9-NEXT: s_add_i32 s8, s8, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8 +; GFX9-NEXT: s_mul_i32 s8, s5, s7 +; GFX9-NEXT: s_sub_i32 s4, s4, s8 +; GFX9-NEXT: s_add_i32 s9, s5, 1 +; GFX9-NEXT: s_sub_i32 s8, s4, s7 +; GFX9-NEXT: s_cmp_ge_u32 s4, s7 +; GFX9-NEXT: s_cselect_b32 s5, s9, s5 +; GFX9-NEXT: s_cselect_b32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s5, 1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s7 +; GFX9-NEXT: s_cselect_b32 s4, s8, s5 +; GFX9-NEXT: s_xor_b32 s4, s4, s6 +; GFX9-NEXT: s_sub_i32 s4, s4, s6 +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x190000 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v_sdiv_i25: diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index da8896db90494..705a2af739590 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -239,14 +239,14 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc -; GCN-NEXT: v_xor_b32_e32 v3, v3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v3, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v2, v2, v4 -; GCN-NEXT: v_cvt_f32_u32_e32 v5, v2 -; GCN-NEXT: v_cvt_f32_u32_e32 v6, v3 -; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v2 -; GCN-NEXT: v_subb_u32_e32 v8, vcc, 0, v3, vcc +; GCN-NEXT: v_xor_b32_e32 v3, v5, v4 +; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3 +; GCN-NEXT: v_cvt_f32_u32_e32 v6, v2 +; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 +; GCN-NEXT: v_subb_u32_e32 v8, vcc, 0, v2, vcc ; GCN-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; GCN-NEXT: v_rcp_f32_e32 v5, v5 ; GCN-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 @@ -318,33 +318,33 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v8, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v10, v3, v5 +; GCN-NEXT: v_mul_lo_u32 v8, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v9, v3, v5 +; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GCN-NEXT: v_mul_lo_u32 v9, v2, v5 +; GCN-NEXT: v_mul_lo_u32 v9, v3, v5 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GCN-NEXT: v_sub_i32_e32 v10, vcc, v1, v8 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; GCN-NEXT: v_subb_u32_e64 v9, s[4:5], v10, v3, vcc -; GCN-NEXT: v_sub_i32_e64 v10, s[4:5], v0, v2 +; GCN-NEXT: v_subb_u32_e64 v9, s[4:5], v10, v2, vcc +; GCN-NEXT: v_sub_i32_e64 v10, s[4:5], v0, v3 ; GCN-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 +; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2 ; GCN-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v2 +; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v3 ; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v2 ; GCN-NEXT: v_cndmask_b32_e64 v9, v11, v10, s[4:5] ; GCN-NEXT: v_add_i32_e64 v10, s[4:5], 2, v5 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; GCN-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v6, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_add_i32_e64 v12, s[4:5], 1, v5 ; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; GCN-NEXT: v_addc_u32_e64 v13, s[4:5], 0, v6, s[4:5] ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 ; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GCN-NEXT: v_cndmask_b32_e64 v9, v13, v11, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index b03353972ab66..adce63c7e45e7 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -692,17 +692,17 @@ define amdgpu_kernel void @select_v2f16( ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 -; GFX11-NEXT: s_mov_b32 s14, -1 -; GFX11-NEXT: s_mov_b32 s15, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s14 -; GFX11-NEXT: s_mov_b32 s3, s15 -; GFX11-NEXT: s_mov_b32 s22, s14 -; GFX11-NEXT: s_mov_b32 s23, s15 -; GFX11-NEXT: s_mov_b32 s18, s14 -; GFX11-NEXT: s_mov_b32 s19, s15 -; GFX11-NEXT: s_mov_b32 s26, s14 -; GFX11-NEXT: s_mov_b32 s27, s15 +; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x44 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s22, s2 +; GFX11-NEXT: s_mov_b32 s23, s3 +; GFX11-NEXT: s_mov_b32 s18, s2 +; GFX11-NEXT: s_mov_b32 s19, s3 +; GFX11-NEXT: s_mov_b32 s26, s2 +; GFX11-NEXT: s_mov_b32 s27, s3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s20, s8 ; GFX11-NEXT: s_mov_b32 s21, s9 @@ -710,12 +710,12 @@ define amdgpu_kernel void @select_v2f16( ; GFX11-NEXT: s_mov_b32 s17, s7 ; GFX11-NEXT: s_mov_b32 s24, s10 ; GFX11-NEXT: s_mov_b32 s25, s11 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0 ; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0 ; GFX11-NEXT: buffer_load_b32 v3, off, s[24:27], 0 -; GFX11-NEXT: s_mov_b32 s12, s4 -; GFX11-NEXT: s_mov_b32 s13, s5 +; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: s_mov_b32 s1, s5 ; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(2) @@ -730,7 +730,7 @@ define amdgpu_kernel void @select_v2f16( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll index 3a5638344d751..eaf00e861ce97 100644 --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -15,8 +15,11 @@ ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 1 ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 2 ; SGPR-NEXT: v_readlane_b32 s[[HI:[0-9]+]], [[VHI]], 3 +; SGPR-NEXT: s_or_saveexec_b64 s[100:101], -1 +; SGPR-NEXT: s_mov_b64 exec, s[100:101] +; SGPR-NEXT: s_nop 2 +; SGPR-NEXT: buffer_store_dword v0, off, s[{{[0-9]+}}:[[HI]]], 0 ; SGPR-NEXT: ; kill: killed $vgpr1 -; SGPR-NEXT: s_nop 4 ; ALL: s_endpgm define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 02c18720d05f5..08db1e7fee259 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10191,19 +10191,11 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:64 -; GFX6-NEXT: s_mov_b32 s2, 0x80800 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_load_dwordx4 v[17:20], v[7:8], s[4:7], 0 addr64 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 ; GFX6-NEXT: buffer_load_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:16 -; GFX6-NEXT: s_mov_b32 s2, 0x80c00 +; GFX6-NEXT: s_mov_b32 s2, 0x80800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10225,9 +10217,17 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[8:9] -; GFX6-NEXT: buffer_load_dwordx4 v[17:20], v[7:8], s[4:7], 0 addr64 offset:48 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[7:8], s[4:7], 0 addr64 offset:48 +; GFX6-NEXT: s_mov_b32 s2, 0x80c00 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(3) ; GFX6-NEXT: v_mov_b32_e32 v7, 1 ; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: buffer_store_dword v7, v4, s[40:43], 0 offen @@ -10496,25 +10496,23 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_mov_b32 s2, 0x84c00 -; GFX6-NEXT: buffer_store_dword v17, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v19, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v20, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(4) +; GFX6-NEXT: v_mov_b32_e32 v0, v17 +; GFX6-NEXT: v_mov_b32_e32 v1, v18 +; GFX6-NEXT: v_mov_b32_e32 v2, v19 +; GFX6-NEXT: v_mov_b32_e32 v3, v20 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s2 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v20, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s2, 0x84400 +; GFX6-NEXT: v_mov_b32_e32 v20, v3 ; GFX6-NEXT: buffer_load_dword v13, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s2, 0x83c00 +; GFX6-NEXT: v_mov_b32_e32 v19, v2 +; GFX6-NEXT: v_mov_b32_e32 v18, v1 +; GFX6-NEXT: v_mov_b32_e32 v17, v0 ; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload @@ -10634,20 +10632,19 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x80800 +; GFX6-NEXT: s_mov_b32 s4, 0x80c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:80 +; GFX6-NEXT: buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x80c00 +; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v20, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s4, 0x80800 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:64 ; GFX6-NEXT: buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:48 ; GFX6-NEXT: buffer_store_dwordx4 v[13:16], v[4:5], s[0:3], 0 addr64 offset:32 -; GFX6-NEXT: s_waitcnt expcnt(2) ; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll index c056d35c56beb..314785cdbefd6 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll @@ -136,13 +136,13 @@ define amdgpu_kernel void @max_256_vgprs_spill_9x32(ptr addrspace(1) %p) #1 { ; GFX908-DAG: v_accvgpr_read_b32 ; GFX900: NumVgprs: 256 -; GFX908: NumVgprs: 252 -; GFX900: ScratchSize: 1668 +; GFX908: NumVgprs: 254 +; GFX900: ScratchSize: 1796 ; GFX908: ScratchSize: 0 ; GFX900: VGPRBlocks: 63 -; GFX908: VGPRBlocks: 62 +; GFX908: VGPRBlocks: 63 ; GFX900: NumVGPRsForWavesPerEU: 256 -; GFX908: NumVGPRsForWavesPerEU: 252 +; GFX908: NumVGPRsForWavesPerEU: 254 define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(ptr addrspace(1) %p) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %p1 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index c85d15bc2fcff..613349f32e2d5 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -218,14 +218,14 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc -; GCN-NEXT: v_xor_b32_e32 v3, v3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v3, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v2, v2, v4 -; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 -; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc +; GCN-NEXT: v_xor_b32_e32 v3, v5, v4 +; GCN-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GCN-NEXT: v_cvt_f32_u32_e32 v5, v2 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 +; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v2, vcc ; GCN-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GCN-NEXT: v_rcp_f32_e32 v4, v4 ; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -297,34 +297,34 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v7, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v8, v3, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 +; GCN-NEXT: v_mul_hi_u32 v7, v3, v4 +; GCN-NEXT: v_mul_lo_u32 v8, v2, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GCN-NEXT: v_sub_i32_e32 v7, vcc, v1, v5 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v7, v3, vcc -; GCN-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2 +; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v7, v2, vcc +; GCN-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v3 ; GCN-NEXT: v_subbrev_u32_e64 v8, s[6:7], 0, v4, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v3 +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v2 ; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v2 +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[6:7] -; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v8, v3 -; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v3, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v8, v2 +; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v2, s[4:5] +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7] -; GCN-NEXT: v_sub_i32_e64 v10, s[4:5], v7, v2 +; GCN-NEXT: v_sub_i32_e64 v10, s[4:5], v7, v3 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 -; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v10, s[4:5] ; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[4:5] @@ -357,25 +357,25 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_add_i32_e64 v6, s[6:7], 32, v6 ; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v1 ; GCN-IR-NEXT: v_min_u32_e32 v11, v6, v7 -; GCN-IR-NEXT: v_sub_i32_e64 v7, s[6:7], v10, v11 +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[6:7], v10, v11 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_subb_u32_e64 v8, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[7:8] +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[6:7], 0, 0, s[6:7] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[6:7] ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[7:8] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7] ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v5, v4 ; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v8, v0, 0, s[4:5] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v7 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v8, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 63, v7 +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 63, v6 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[0:1], v6 ; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 @@ -425,15 +425,15 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 ; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v7 -; GCN-IR-NEXT: v_or_b32_e32 v6, v8, v6 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6 ; GCN-IR-NEXT: .LBB1_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v7, v2, v9 -; GCN-IR-NEXT: v_mul_hi_u32 v8, v2, v6 -; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v6 -; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, v6 -; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v6, v2, v9 +; GCN-IR-NEXT: v_mul_hi_u32 v7, v2, v8 +; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v8 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, v8 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v4 @@ -1584,22 +1584,22 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v6 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s6, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[3:4] ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cndmask_b32_e64 v3, 24, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, 24, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v3 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 @@ -1647,14 +1647,14 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7 -; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v6 +; GCN-IR-NEXT: v_or_b32_e32 v5, v4, v6 ; GCN-IR-NEXT: .LBB11_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2 -; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v3 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v3 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v3 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v5 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/swdev373493.ll b/llvm/test/CodeGen/AMDGPU/swdev373493.ll index 4d1d88d643f15..4f33e19835172 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev373493.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev373493.ll @@ -22,16 +22,16 @@ define hidden fastcc void @bar(i32 %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %a ; CHECK-NEXT: s_cbranch_scc1 .LBB0_5 ; CHECK-NEXT: ; %bb.2: ; %bb7 ; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, global@rel32@lo+1948 -; CHECK-NEXT: s_addc_u32 s17, s17, global@rel32@hi+1956 -; CHECK-NEXT: v_mov_b32_e32 v5, 0 -; CHECK-NEXT: v_mov_b32_e32 v0, s16 -; CHECK-NEXT: v_mov_b32_e32 v1, s17 ; CHECK-NEXT: s_getpc_b64 s[18:19] -; CHECK-NEXT: s_add_u32 s18, s18, eggs@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s19, s19, eggs@rel32@hi+12 -; CHECK-NEXT: s_setpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, global@rel32@lo+1948 +; CHECK-NEXT: s_addc_u32 s19, s19, global@rel32@hi+1956 +; CHECK-NEXT: v_mov_b32_e32 v5, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, s18 +; CHECK-NEXT: v_mov_b32_e32 v1, s19 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, eggs@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, eggs@rel32@hi+12 +; CHECK-NEXT: s_setpc_b64 s[16:17] ; CHECK-NEXT: .LBB0_3: ; %LeafBlock1 ; CHECK-NEXT: s_cbranch_scc0 .LBB0_5 ; CHECK-NEXT: ; %bb.4: ; %bb8 diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll index b7d3a96042569..7f989e30118b9 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll @@ -89,15 +89,13 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce) ; CHECK-NEXT: s_mov_b32 s1, s3 ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] ; CHECK-NEXT: v_writelane_b32 v2, s0, 7 -; CHECK-NEXT: v_writelane_b32 v2, s1, 8 -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, 0x40140000 ; CHECK-NEXT: s_mov_b32 s4, s0 +; CHECK-NEXT: v_writelane_b32 v2, s1, 8 ; CHECK-NEXT: v_readlane_b32 s0, v2, 0 ; CHECK-NEXT: v_readlane_b32 s2, v2, 11 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[4:5] ; CHECK-NEXT: s_add_i32 s2, s2, s0 ; CHECK-NEXT: v_writelane_b32 v2, s2, 11 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[4:5] ; CHECK-NEXT: v_readlane_b32 s0, v2, 11 ; CHECK-NEXT: s_cmpk_lt_i32 s0, 0xa00 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 88a49cfc6e669..cf30131b8ab58 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -818,9 +818,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s1 +; GCN-NEXT: flat_load_dwordx4 v[6:9], v[4:5] +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_cvt_f32_u32_e32 v10, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v12, v1 @@ -855,51 +855,51 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_add_u32_e32 v12, vcc, v14, v15 ; GCN-NEXT: v_add_u32_e32 v13, vcc, v16, v17 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_hi_u32 v10, v4, v10 -; GCN-NEXT: v_mul_hi_u32 v11, v5, v11 -; GCN-NEXT: v_mul_hi_u32 v12, v6, v12 -; GCN-NEXT: v_mul_hi_u32 v13, v7, v13 +; GCN-NEXT: v_mul_hi_u32 v10, v6, v10 +; GCN-NEXT: v_mul_hi_u32 v11, v7, v11 +; GCN-NEXT: v_mul_hi_u32 v12, v8, v12 +; GCN-NEXT: v_mul_hi_u32 v13, v9, v13 ; GCN-NEXT: v_mul_lo_u32 v14, v10, v0 ; GCN-NEXT: v_mul_lo_u32 v16, v11, v1 ; GCN-NEXT: v_mul_lo_u32 v18, v12, v2 ; GCN-NEXT: v_mul_lo_u32 v20, v13, v3 -; GCN-NEXT: v_sub_u32_e32 v4, vcc, v4, v14 -; GCN-NEXT: v_sub_u32_e32 v5, vcc, v5, v16 -; GCN-NEXT: v_sub_u32_e32 v6, vcc, v6, v18 -; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v20 +; GCN-NEXT: v_sub_u32_e32 v6, vcc, v6, v14 +; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v16 +; GCN-NEXT: v_sub_u32_e32 v8, vcc, v8, v18 +; GCN-NEXT: v_sub_u32_e32 v9, vcc, v9, v20 ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10 ; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11 ; GCN-NEXT: v_add_u32_e32 v19, vcc, 1, v12 ; GCN-NEXT: v_add_u32_e32 v21, vcc, 1, v13 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 -; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 -; GCN-NEXT: v_sub_u32_e32 v14, vcc, v4, v0 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v6, v0 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v7, v1 +; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v3 +; GCN-NEXT: v_sub_u32_e32 v14, vcc, v6, v0 ; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[0:1] -; GCN-NEXT: v_sub_u32_e32 v15, vcc, v5, v1 +; GCN-NEXT: v_sub_u32_e32 v15, vcc, v7, v1 ; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[2:3] -; GCN-NEXT: v_sub_u32_e32 v16, vcc, v6, v2 +; GCN-NEXT: v_sub_u32_e32 v16, vcc, v8, v2 ; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v19, s[4:5] -; GCN-NEXT: v_sub_u32_e32 v17, vcc, v7, v3 +; GCN-NEXT: v_sub_u32_e32 v17, vcc, v9, v3 ; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v21, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[0:1] ; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v10 -; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v15, s[2:3] +; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[2:3] ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v11 -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v16, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[4:5] ; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v12 -; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[6:7] ; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v13 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v14, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v1 ; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v15, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2 ; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v16, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v3 ; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc -; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm ; ; GFX1030-LABEL: udiv_v4i32: @@ -1848,20 +1848,20 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i24: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 ; SI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -1890,25 +1890,25 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; SI-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_udiv_i24: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 ; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 ; VI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -1937,7 +1937,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; VI-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GCN-LABEL: v_udiv_i24: diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 972e03b8960d6..894c96acbbcd6 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -333,24 +333,24 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 ; GCN-IR-NEXT: v_min_u32_e32 v9, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e64 v5, s[6:7], v8, v9 +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v8, v9 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_subb_u32_e64 v6, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[5:6] +; GCN-IR-NEXT: v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[4:5] ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[5:6] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[4:5] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v6, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v5 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 @@ -400,15 +400,15 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5 -; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4 ; GCN-IR-NEXT: .LBB1_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v5, v2, v7 -; GCN-IR-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v4, v2, v7 +; GCN-IR-NEXT: v_mul_hi_u32 v5, v2, v6 +; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v6 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, v6 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index b22ae9b9c8527..b8d18f56b7602 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -334,8 +334,8 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: v_lshrrev_b32_e32 v26, 16, v8 ; GFX906-NEXT: v_lshrrev_b32_e32 v27, 8, v8 ; GFX906-NEXT: v_lshrrev_b32_e32 v28, 24, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v30, 8, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v7 ; GFX906-NEXT: v_lshrrev_b32_e32 v32, 24, v6 ; GFX906-NEXT: v_lshrrev_b32_e32 v33, 16, v6 ; GFX906-NEXT: v_lshrrev_b32_e32 v31, 8, v6 @@ -369,8 +369,8 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: v_lshrrev_b32_e32 v26, 16, v8 ; GFX906-NEXT: v_lshrrev_b32_e32 v27, 8, v8 ; GFX906-NEXT: v_lshrrev_b32_e32 v28, 24, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v30, 8, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v7 ; GFX906-NEXT: v_lshrrev_b32_e32 v32, 24, v6 ; GFX906-NEXT: v_lshrrev_b32_e32 v33, 16, v6 ; GFX906-NEXT: v_lshrrev_b32_e32 v31, 8, v6 @@ -378,10 +378,10 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_lshlrev_b16_e32 v28, 8, v28 ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v32 -; GFX906-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v31 +; GFX906-NEXT: v_or_b32_sdwa v28, v30, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v31 ; GFX906-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v6, v6, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v27 ; GFX906-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -409,7 +409,7 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: v_or_b32_sdwa v4, v14, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v12 -; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v30 +; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v10 ; GFX906-NEXT: v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -859,15 +859,15 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v4 ; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v63, 24, v2 ; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; GFX906-NEXT: buffer_store_dword v63, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v63, 16, v2 ; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v63, 8, v3 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v3 +; GFX906-NEXT: buffer_store_dword v63, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v63, 8, v2 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB6_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 @@ -1287,16 +1287,16 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v63, 8, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v3 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v63, 8, v2 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill ; GFX906-NEXT: .LBB6_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v62 +; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v63 ; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v63 +; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v62 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index e274fc2592149..94b822ac48875 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -523,22 +523,22 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_min_u32_e32 v1, 0x100, v0 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: ; implicit-def: $sgpr3 +; GFX1032-NEXT: s_mov_b32 s3, 0 +; GFX1032-NEXT: ; implicit-def: $sgpr4 ; GFX1032-NEXT: s_branch .LBB11_4 ; GFX1032-NEXT: .LBB11_2: ; %bb8 ; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: s_add_i32 s4, s4, 1 +; GFX1032-NEXT: s_add_i32 s3, s3, 1 ; GFX1032-NEXT: global_store_dword v2, v0, s[0:1] -; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s4, v1 +; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s3, v1 ; GFX1032-NEXT: s_add_u32 s0, s0, 4 ; GFX1032-NEXT: s_addc_u32 s1, s1, 0 -; GFX1032-NEXT: s_andn2_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_andn2_b32 s4, s4, exec_lo ; GFX1032-NEXT: s_and_b32 s5, vcc_lo, exec_lo -; GFX1032-NEXT: s_or_b32 s3, s3, s5 +; GFX1032-NEXT: s_or_b32 s4, s4, s5 ; GFX1032-NEXT: .LBB11_3: ; %Flow ; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: s_and_b32 s5, exec_lo, s3 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, s4 ; GFX1032-NEXT: s_or_b32 s2, s5, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execz .LBB11_6 @@ -546,12 +546,12 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v3, v2, s[0:1] -; GFX1032-NEXT: s_or_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_or_b32 s4, s4, exec_lo ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 11, v3 ; GFX1032-NEXT: s_cbranch_vccz .LBB11_2 ; GFX1032-NEXT: ; %bb.5: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: ; implicit-def: $sgpr4 +; GFX1032-NEXT: ; implicit-def: $sgpr3 ; GFX1032-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GFX1032-NEXT: s_branch .LBB11_3 ; GFX1032-NEXT: .LBB11_6: ; %.loopexit @@ -1803,64 +1803,64 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 { ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032-NEXT: v_mov_b32_e32 v4, v0 ; GFX1032-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-NEXT: s_branch .LBB33_2 ; GFX1032-NEXT: .LBB33_1: ; %body ; GFX1032-NEXT: ; in Loop: Header=BB33_2 Depth=1 -; GFX1032-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX1032-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX1032-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX1032-NEXT: s_cbranch_execz .LBB33_4 ; GFX1032-NEXT: .LBB33_2: ; %loop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v7, v3 -; GFX1032-NEXT: v_mov_b32_e32 v6, v2 -; GFX1032-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032-NEXT: v_mov_b32_e32 v4, v0 +; GFX1032-NEXT: v_mov_b32_e32 v0, v4 +; GFX1032-NEXT: v_mov_b32_e32 v1, v5 +; GFX1032-NEXT: v_mov_b32_e32 v2, v6 +; GFX1032-NEXT: v_mov_b32_e32 v3, v7 ; GFX1032-NEXT: s_cbranch_vccz .LBB33_1 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX1032-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1032-NEXT: ; implicit-def: $vgpr8 ; GFX1032-NEXT: .LBB33_4: ; %break ; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v0, v4 -; GFX1032-NEXT: v_mov_b32_e32 v1, v5 -; GFX1032-NEXT: v_mov_b32_e32 v2, v6 -; GFX1032-NEXT: v_mov_b32_e32 v3, v7 ; GFX1032-NEXT: ; return to shader part epilog ; ; GFX1064-LABEL: test_loop_vcc: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_wqm_b64 exec, exec +; GFX1064-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064-NEXT: v_mov_b32_e32 v4, v0 ; GFX1064-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-NEXT: s_branch .LBB33_2 ; GFX1064-NEXT: .LBB33_1: ; %body ; GFX1064-NEXT: ; in Loop: Header=BB33_2 Depth=1 -; GFX1064-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX1064-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX1064-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX1064-NEXT: s_cbranch_execz .LBB33_4 ; GFX1064-NEXT: .LBB33_2: ; %loop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_cmp_lt_f32_e32 vcc, 0x40e00000, v8 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v7, v3 -; GFX1064-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064-NEXT: v_mov_b32_e32 v4, v0 +; GFX1064-NEXT: v_mov_b32_e32 v0, v4 +; GFX1064-NEXT: v_mov_b32_e32 v1, v5 +; GFX1064-NEXT: v_mov_b32_e32 v2, v6 +; GFX1064-NEXT: v_mov_b32_e32 v3, v7 ; GFX1064-NEXT: s_cbranch_vccz .LBB33_1 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX1064-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1064-NEXT: ; implicit-def: $vgpr8 ; GFX1064-NEXT: .LBB33_4: ; %break ; GFX1064-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v0, v4 -; GFX1064-NEXT: v_mov_b32_e32 v1, v5 -; GFX1064-NEXT: v_mov_b32_e32 v2, v6 -; GFX1064-NEXT: v_mov_b32_e32 v3, v7 ; GFX1064-NEXT: ; return to shader part epilog entry: br label %loop diff --git a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll index 6bf1a5ed21454..9d7570b9a929e 100644 --- a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll @@ -350,16 +350,16 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; CHECK-NEXT: r13:12 = add(r5:4,r7:6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: p0 = cmp.gtu(r5:4,r3:2) -; CHECK-NEXT: p1 = cmp.eq(r5:4,r9:8) +; CHECK-NEXT: p1 = cmp.gtu(r5:4,r3:2) +; CHECK-NEXT: p0 = cmp.eq(r5:4,r9:8) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r1 = mux(p0,r2,r12) -; CHECK-NEXT: r14 = mux(p0,r3,r13) +; CHECK-NEXT: r1 = mux(p1,r2,r12) +; CHECK-NEXT: r14 = mux(p1,r3,r13) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r10 = mux(p1,r2,r1) -; CHECK-NEXT: r11 = mux(p1,r3,r14) +; CHECK-NEXT: r10 = mux(p0,r2,r1) +; CHECK-NEXT: r11 = mux(p0,r3,r14) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: memd_locked(r0,p0) = r11:10 diff --git a/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll b/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll index 800f89005e310..66db73f5c69f6 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll @@ -493,7 +493,7 @@ define void @f32s8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r4 = ##-2147483648 ; CHECK-NEXT: r3:2 = combine(#1,#8) -; CHECK-NEXT: v4 = vmem(r0+#0) +; CHECK-NEXT: v5 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vsplat(r4) @@ -504,11 +504,11 @@ define void @f32s8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v10 = vsplat(r7) ; CHECK-NEXT: r5 = #32 -; CHECK-NEXT: v8.w = vasl(v6.w,r3) -; CHECK-NEXT: v6.cur = vmem(r0+#1) +; CHECK-NEXT: v8.w = vasl(v4.w,r3) +; CHECK-NEXT: v4.cur = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.w = vasl(v4.w,r3) +; CHECK-NEXT: v7.w = vasl(v5.w,r3) ; CHECK-NEXT: v12 = vxor(v12,v12) ; CHECK-NEXT: v8.w = vsub(v8.w,v1.w) ; CHECK-NEXT: v0 = vmem(r0+#3) @@ -517,11 +517,11 @@ define void @f32s8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v13 = vsplat(r5) ; CHECK-NEXT: v11.w = vasl(v0.w,r3) ; CHECK-NEXT: v7.w = vsub(v7.w,v1.w) -; CHECK-NEXT: q0 = vcmp.gt(v12.w,v4.w) +; CHECK-NEXT: q0 = vcmp.gt(v12.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.w = vasl(v2.w,r3) -; CHECK-NEXT: q1 = vcmp.gt(v12.w,v6.w) +; CHECK-NEXT: q1 = vcmp.gt(v12.w,v4.w) ; CHECK-NEXT: v11.w = vsub(v11.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -530,82 +530,82 @@ define void @f32s8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v8.w = vasr(v8.w,r6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v21 = vsplat(r3) +; CHECK-NEXT: v22 = vsplat(r3) ; CHECK-NEXT: v7.w = vasr(v7.w,r6) -; CHECK-NEXT: v18.w = vsub(v9.w,v1.w) +; CHECK-NEXT: v19.w = vsub(v9.w,v1.w) ; CHECK-NEXT: v8.w = vsub(v10.w,v8.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v19.w = vasl(v6.w,r2) -; CHECK-NEXT: v26 = vmux(q1,v1,v21) -; CHECK-NEXT: v24 = vmux(q0,v1,v21) +; CHECK-NEXT: v20.w = vasl(v4.w,r2) +; CHECK-NEXT: v27 = vmux(q1,v1,v22) +; CHECK-NEXT: v25 = vmux(q0,v1,v22) ; CHECK-NEXT: v7.w = vsub(v10.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vasl(v4.w,r2) +; CHECK-NEXT: v6.w = vasl(v5.w,r2) ; CHECK-NEXT: v8.w = vmin(v8.w,v13.w) -; CHECK-NEXT: v9 = vor(v19,v1) -; CHECK-NEXT: v20.w = vmin(v7.w,v13.w) +; CHECK-NEXT: v9 = vor(v20,v1) +; CHECK-NEXT: v21.w = vmin(v7.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = vasr(v18.w,r6) +; CHECK-NEXT: v5.w = vasr(v19.w,r6) ; CHECK-NEXT: q3 = vcmp.gt(v8.w,v12.w) -; CHECK-NEXT: v5 = vor(v5,v1) -; CHECK-NEXT: q2 = vcmp.gt(v20.w,v12.w) +; CHECK-NEXT: v6 = vor(v6,v1) +; CHECK-NEXT: q2 = vcmp.gt(v21.w,v12.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.w = vasr(v11.w,r6) -; CHECK-NEXT: v4.w = vsub(v10.w,v4.w) +; CHECK-NEXT: v5.w = vsub(v10.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.w = vasl(v2.w,r2) ; CHECK-NEXT: v10.w = vsub(v10.w,v11.w) -; CHECK-NEXT: v4.w = vmin(v4.w,v13.w) +; CHECK-NEXT: v5.w = vmin(v5.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v22.w = vasl(v0.w,r2) +; CHECK-NEXT: v23.w = vasl(v0.w,r2) ; CHECK-NEXT: v3 = vor(v3,v1) ; CHECK-NEXT: v10.w = vmin(v10.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.w = vlsr(v9.w,v8.w) -; CHECK-NEXT: v6 = vor(v22,v1) +; CHECK-NEXT: v4 = vor(v23,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vlsr(v5.w,v20.w) -; CHECK-NEXT: v25.w = vsub(v12.w,v8.w) +; CHECK-NEXT: v6.w = vlsr(v6.w,v21.w) +; CHECK-NEXT: v26.w = vsub(v12.w,v8.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.w = vlsr(v3.w,v4.w) -; CHECK-NEXT: v23.w = vsub(v12.w,v5.w) -; CHECK-NEXT: v8 = vmux(q1,v25,v8) +; CHECK-NEXT: v3.w = vlsr(v3.w,v5.w) +; CHECK-NEXT: v24.w = vsub(v12.w,v6.w) +; CHECK-NEXT: v8 = vmux(q1,v26,v8) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.w = vlsr(v6.w,v10.w) -; CHECK-NEXT: v5 = vmux(q0,v23,v5) +; CHECK-NEXT: v4.w = vlsr(v4.w,v10.w) +; CHECK-NEXT: v6 = vmux(q0,v24,v6) ; CHECK-NEXT: q0 = vcmp.gt(v12.w,v2.w) -; CHECK-NEXT: v27.w = vsub(v12.w,v3.w) +; CHECK-NEXT: v28.w = vsub(v12.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vmux(q3,v8,v26) -; CHECK-NEXT: v28.w = vsub(v12.w,v6.w) +; CHECK-NEXT: v2 = vmux(q3,v8,v27) +; CHECK-NEXT: v29.w = vsub(v12.w,v4.w) ; CHECK-NEXT: q3 = vcmp.gt(v12.w,v0.w) -; CHECK-NEXT: v5 = vmux(q2,v5,v24) +; CHECK-NEXT: v6 = vmux(q2,v6,v25) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29 = vmux(q0,v1,v21) -; CHECK-NEXT: v3 = vmux(q0,v27,v3) -; CHECK-NEXT: q2 = vcmp.gt(v4.w,v12.w) -; CHECK-NEXT: v30 = vmux(q3,v28,v6) +; CHECK-NEXT: v30 = vmux(q0,v1,v22) +; CHECK-NEXT: v3 = vmux(q0,v28,v3) +; CHECK-NEXT: q2 = vcmp.gt(v5.w,v12.w) +; CHECK-NEXT: v4 = vmux(q3,v29,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.h = vpack(v2.w,v5.w):sat -; CHECK-NEXT: v1 = vmux(q3,v1,v21) +; CHECK-NEXT: v2.h = vpack(v2.w,v6.w):sat +; CHECK-NEXT: v1 = vmux(q3,v1,v22) ; CHECK-NEXT: q3 = vcmp.gt(v10.w,v12.w) -; CHECK-NEXT: v0 = vmux(q2,v3,v29) +; CHECK-NEXT: v0 = vmux(q2,v3,v30) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vmux(q3,v30,v1) +; CHECK-NEXT: v1 = vmux(q3,v4,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.h = vpack(v1.w,v0.w):sat @@ -1547,55 +1547,59 @@ define void @f32u8_0(ptr %a0, ptr %a1) #0 { ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: r3:2 = combine(##-2147483648,#8) -; CHECK-NEXT: r4 = #1 +; CHECK-NEXT: r3:2 = combine(#8,#1) +; CHECK-NEXT: r4 = ##-2147483648 ; CHECK-NEXT: v5 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3 = vsplat(r3) +; CHECK-NEXT: v3 = vsplat(r4) ; CHECK-NEXT: r5 = #30 ; CHECK-NEXT: r6 = #24 ; CHECK-NEXT: v2 = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v14 = vsplat(r5) -; CHECK-NEXT: v8.w = vasl(v5.w,r4) -; CHECK-NEXT: v13 = vxor(v13,v13) +; CHECK-NEXT: r4 = #32 +; CHECK-NEXT: v8.w = vasl(v5.w,r2) ; CHECK-NEXT: v0 = vmem(r0+#3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r7 = #64 -; CHECK-NEXT: v9.w = vasl(v2.w,r4) +; CHECK-NEXT: v9.w = vasl(v2.w,r2) +; CHECK-NEXT: v13 = vxor(v13,v13) ; CHECK-NEXT: v8.w = vsub(v8.w,v3.w) ; CHECK-NEXT: v1 = vmem(r0+#2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.w = vasl(v0.w,r4) -; CHECK-NEXT: q0 = vcmp.gt(v13.w,v5.w) +; CHECK-NEXT: v20 = vsplat(r4) +; CHECK-NEXT: v12.w = vasl(v0.w,r2) ; CHECK-NEXT: v9.w = vsub(v9.w,v3.w) -; CHECK-NEXT: q3 = vcmp.gt(v13.w,v2.w) +; CHECK-NEXT: q0 = vcmp.gt(v13.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r4 = #32 -; CHECK-NEXT: v11.w = vasl(v1.w,r4) +; CHECK-NEXT: v11.w = vasl(v1.w,r2) +; CHECK-NEXT: q3 = vcmp.gt(v13.w,v2.w) ; CHECK-NEXT: v12.w = vsub(v12.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v20 = vsplat(r4) -; CHECK-NEXT: v8.w = vasr(v8.w,r6) +; CHECK-NEXT: r2 = ##2147483647 +; CHECK-NEXT: r7 = #64 ; CHECK-NEXT: v11.w = vsub(v11.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: v22 = vsplat(r2) +; CHECK-NEXT: v8.w = vasr(v8.w,r6) +; CHECK-NEXT: } +; CHECK-NEXT: { ; CHECK-NEXT: v9.w = vasr(v9.w,r6) ; CHECK-NEXT: v8.w = vsub(v14.w,v8.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.w = vasl(v5.w,r2) +; CHECK-NEXT: v6.w = vasl(v5.w,r3) ; CHECK-NEXT: v9.w = vsub(v14.w,v9.w) ; CHECK-NEXT: v8.w = vmin(v8.w,v20.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.w = vasl(v2.w,r2) +; CHECK-NEXT: v7.w = vasl(v2.w,r3) ; CHECK-NEXT: v6 = vor(v6,v3) ; CHECK-NEXT: v9.w = vmin(v9.w,v20.w) ; CHECK-NEXT: q1 = vcmp.gt(v13.w,v8.w) @@ -1610,17 +1614,15 @@ define void @f32u8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v5.w = vsub(v14.w,v19.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = vasl(v1.w,r2) +; CHECK-NEXT: v4.w = vasl(v1.w,r3) ; CHECK-NEXT: v21.w = vsub(v14.w,v12.w) ; CHECK-NEXT: v5.w = vmin(v5.w,v20.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r2 = ##2147483647 -; CHECK-NEXT: v10.w = vasl(v0.w,r2) +; CHECK-NEXT: v10.w = vasl(v0.w,r3) ; CHECK-NEXT: v4 = vor(v4,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v22 = vsplat(r2) ; CHECK-NEXT: v6.w = vlsr(v6.w,v8.w) ; CHECK-NEXT: v3 = vor(v10,v3) ; CHECK-NEXT: v10.w = vmin(v21.w,v20.w) diff --git a/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll b/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll index 4c651ae474053..5cfa09b0822bb 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll @@ -19,9 +19,9 @@ define void @s8f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v1:0.h = vunpack(v0.b) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.h = vsplat(r7) +; CHECK-NEXT: v2.h = vsplat(r7) ; CHECK-NEXT: r3:2 = combine(#31,#5) -; CHECK-NEXT: v2.h = vabs(v0.h) +; CHECK-NEXT: v3.h = vabs(v0.h) ; CHECK-NEXT: v4.h = vabs(v1.h) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -31,62 +31,62 @@ define void @s8f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r5 = ##32768 -; CHECK-NEXT: v5.uh = vcl0(v2.uh) +; CHECK-NEXT: v5.uh = vcl0(v3.uh) ; CHECK-NEXT: q0 = vcmp.gt(v9.h,v0.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10.h = vsplat(r5) ; CHECK-NEXT: r4 = #10 ; CHECK-NEXT: v6.uh = vcl0(v4.uh) -; CHECK-NEXT: v5.h = vadd(v5.h,v3.h) +; CHECK-NEXT: v5.h = vadd(v5.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27 = vmux(q0,v10,v9) -; CHECK-NEXT: v6.h = vadd(v6.h,v3.h) +; CHECK-NEXT: v6.h = vadd(v6.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.h = vasl(v2.h,v5.h) +; CHECK-NEXT: v3.h = vasl(v3.h,v5.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.h = vasl(v4.h,v6.h) -; CHECK-NEXT: v13 = vand(v2,v8) -; CHECK-NEXT: v11.h = vadd(v2.h,v7.h) +; CHECK-NEXT: v13 = vand(v3,v8) +; CHECK-NEXT: v11.h = vadd(v3.h,v7.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v14.h = vadd(v4.h,v7.h) ; CHECK-NEXT: q2 = vcmp.eq(v13.h,v9.h) ; CHECK-NEXT: v8 = vand(v4,v8) -; CHECK-NEXT: q1 = vcmp.gt(v2.uh,v11.uh) +; CHECK-NEXT: q1 = vcmp.gt(v3.uh,v11.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.uh = vlsr(v11.uh,r2) -; CHECK-NEXT: v13 = vmux(q2,v9,v3) +; CHECK-NEXT: v13 = vmux(q2,v9,v2) ; CHECK-NEXT: q2 = vcmp.eq(v8.h,v9.h) ; CHECK-NEXT: q3 = vcmp.gt(v4.uh,v14.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v20.uh = vlsr(v14.uh,r2) -; CHECK-NEXT: v22 = vmux(q2,v9,v3) -; CHECK-NEXT: v21 = vmux(q1,v3,v9) -; CHECK-NEXT: v3 = vmux(q3,v3,v9) +; CHECK-NEXT: v22 = vmux(q2,v9,v2) +; CHECK-NEXT: v21 = vmux(q1,v2,v9) +; CHECK-NEXT: v2 = vmux(q3,v2,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v19.uh = vlsr(v4.uh,r2) ; CHECK-NEXT: v13.h = vadd(v11.h,v13.h) ; CHECK-NEXT: v24.h = vadd(v20.h,v22.h) -; CHECK-NEXT: v3.h = vadd(v3.h,v7.h) +; CHECK-NEXT: v2.h = vadd(v2.h,v7.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.uh = vlsr(v2.uh,r2) +; CHECK-NEXT: v12.uh = vlsr(v3.uh,r2) ; CHECK-NEXT: v23.h = vadd(v21.h,v7.h) -; CHECK-NEXT: v3.h = vsub(v3.h,v6.h) +; CHECK-NEXT: v2.h = vsub(v2.h,v6.h) ; CHECK-NEXT: q3 = vcmp.gt(v9.h,v1.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.uh = vlsr(v11.uh,r7) -; CHECK-NEXT: v2.h = vsub(v23.h,v5.h) -; CHECK-NEXT: q1 = vcmp.eq(v12.h,v11.h) -; CHECK-NEXT: q2 = vcmp.eq(v19.h,v20.h) +; CHECK-NEXT: v3.h = vsub(v23.h,v5.h) +; CHECK-NEXT: q2 = vcmp.eq(v12.h,v11.h) +; CHECK-NEXT: q1 = vcmp.eq(v19.h,v20.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v25.uh = vlsr(v13.uh,r7) @@ -95,29 +95,29 @@ define void @s8f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v26.uh = vlsr(v24.uh,r7) -; CHECK-NEXT: v5 = vmux(q1,v25,v11) +; CHECK-NEXT: v5 = vmux(q2,v25,v11) +; CHECK-NEXT: q2 = vcmp.eq(v1.h,v9.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.uh = vlsr(v20.uh,r7) ; CHECK-NEXT: v5 = vor(v27,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.h = vasl(v2.h,r4) -; CHECK-NEXT: v4 = vmux(q2,v26,v4) -; CHECK-NEXT: q2 = vcmp.eq(v1.h,v9.h) +; CHECK-NEXT: v3.h = vasl(v3.h,r4) +; CHECK-NEXT: v4 = vmux(q1,v26,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.h = vasl(v3.h,r4) +; CHECK-NEXT: v2.h = vasl(v2.h,r4) ; CHECK-NEXT: v4 = vor(v28,v4) -; CHECK-NEXT: v29 = vor(v5,v2) +; CHECK-NEXT: v29 = vor(v5,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3 = vor(v4,v3) +; CHECK-NEXT: v2 = vor(v4,v2) ; CHECK-NEXT: v31 = vmux(q3,v9,v29) ; CHECK-NEXT: vmem(r1+#0) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q2,v9,v3) +; CHECK-NEXT: v30 = vmux(q2,v9,v2) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#1) = v30.new ; CHECK-NEXT: } @@ -236,190 +236,188 @@ define void @s8f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v4 = vsplat(r7) ; CHECK-NEXT: r6 = ##-2147483648 -; CHECK-NEXT: v15 = vxor(v15,v15) +; CHECK-NEXT: r5 = #159 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v16 = vsplat(r6) -; CHECK-NEXT: r5 = #159 ; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: v7:6.h = vunpack(v1.b) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v18 = vsplat(r5) +; CHECK-NEXT: v8 = vsplat(r6) ; CHECK-NEXT: v1:0.w = vunpack(v8.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7:6.w = vunpack(v6.h) -; CHECK-NEXT: v8.w = vabs(v1.w) ; CHECK-NEXT: v5.w = vabs(v0.w) +; CHECK-NEXT: v10.w = vabs(v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.w = vabs(v6.w) -; CHECK-NEXT: v11.w = vabs(v7.w) -; CHECK-NEXT: q0 = vcmp.gt(v15.w,v6.w) +; CHECK-NEXT: v26.w = vabs(v6.w) +; CHECK-NEXT: v13.w = vabs(v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.uw = vcl0(v8.uw) -; CHECK-NEXT: v17 = vmux(q0,v16,v15) -; CHECK-NEXT: q0 = vcmp.gt(v15.w,v7.w) +; CHECK-NEXT: v9.uw = vcl0(v5.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v13.uw = vcl0(v9.uw) -; CHECK-NEXT: v12.w = vadd(v12.w,v2.w) +; CHECK-NEXT: v12.uw = vcl0(v26.uw) +; CHECK-NEXT: v9.w = vadd(v9.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v14.uw = vcl0(v11.uw) -; CHECK-NEXT: v13.w = vadd(v13.w,v2.w) +; CHECK-NEXT: v14.uw = vcl0(v13.uw) +; CHECK-NEXT: v15.w = vadd(v12.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v10.uw = vcl0(v5.uw) -; CHECK-NEXT: v14.w = vadd(v14.w,v2.w) +; CHECK-NEXT: v11.uw = vcl0(v10.uw) +; CHECK-NEXT: v12.w = vadd(v14.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.w = vasl(v9.w,v13.w) -; CHECK-NEXT: v10.w = vadd(v10.w,v2.w) +; CHECK-NEXT: v27.w = vasl(v26.w,v15.w) +; CHECK-NEXT: v11.w = vadd(v11.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.w = vasl(v11.w,v14.w) -; CHECK-NEXT: v20 = vand(v9,v4) -; CHECK-NEXT: v19.w = vadd(v9.w,v3.w) +; CHECK-NEXT: v13.w = vasl(v13.w,v12.w) +; CHECK-NEXT: v20 = vand(v27,v4) +; CHECK-NEXT: v19.w = vadd(v27.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.w = vasl(v8.w,v12.w) -; CHECK-NEXT: v23.w = vadd(v11.w,v3.w) -; CHECK-NEXT: q3 = vcmp.eq(v20.w,v15.w) -; CHECK-NEXT: v28 = vand(v11,v4) +; CHECK-NEXT: v16.w = vasl(v5.w,v9.w) +; CHECK-NEXT: v5 = vxor(v5,v5) +; CHECK-NEXT: v23.w = vadd(v13.w,v3.w) +; CHECK-NEXT: v28 = vand(v13,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q3,v15,v2) -; CHECK-NEXT: q3 = vcmp.eq(v28.w,v15.w) -; CHECK-NEXT: v22 = vand(v8,v4) -; CHECK-NEXT: q2 = vcmp.gt(v9.uw,v19.uw) +; CHECK-NEXT: v17.w = vasl(v10.w,v11.w) +; CHECK-NEXT: q3 = vcmp.eq(v20.w,v5.w) +; CHECK-NEXT: q2 = vcmp.gt(v27.uw,v19.uw) +; CHECK-NEXT: q0 = vcmp.gt(v5.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v21.uw = vlsr(v9.uw,r2) -; CHECK-NEXT: v27 = vmux(q3,v15,v2) -; CHECK-NEXT: q1 = vcmp.eq(v22.w,v15.w) -; CHECK-NEXT: v24 = vmux(q2,v2,v15) +; CHECK-NEXT: v21.uw = vlsr(v27.uw,r2) +; CHECK-NEXT: v30 = vmux(q3,v5,v2) +; CHECK-NEXT: q3 = vcmp.eq(v28.w,v5.w) +; CHECK-NEXT: v22 = vand(v17,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.uw = vlsr(v19.uw,r2) -; CHECK-NEXT: v26 = vmux(q1,v15,v2) -; CHECK-NEXT: v13.w = vsub(v24.w,v13.w) +; CHECK-NEXT: v14.uw = vlsr(v19.uw,r2) +; CHECK-NEXT: v27 = vmux(q3,v5,v2) +; CHECK-NEXT: q1 = vcmp.eq(v22.w,v5.w) +; CHECK-NEXT: v24 = vmux(q2,v2,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31.uw = vlsr(v23.uw,r2) -; CHECK-NEXT: v22.w = vadd(v9.w,v30.w) -; CHECK-NEXT: v30.w = vadd(v8.w,v3.w) -; CHECK-NEXT: q2 = vcmp.eq(v21.w,v9.w) +; CHECK-NEXT: v22.w = vadd(v14.w,v30.w) +; CHECK-NEXT: v30.w = vadd(v17.w,v3.w) +; CHECK-NEXT: q2 = vcmp.eq(v21.w,v14.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vasl(v5.w,v10.w) +; CHECK-NEXT: v29.uw = vlsr(v13.uw,r2) ; CHECK-NEXT: v28.w = vadd(v31.w,v27.w) -; CHECK-NEXT: v13.w = vadd(v13.w,v18.w) +; CHECK-NEXT: v3.w = vadd(v16.w,v3.w) +; CHECK-NEXT: v4 = vand(v16,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.uw = vlsr(v11.uw,r2) -; CHECK-NEXT: v3.w = vadd(v5.w,v3.w) -; CHECK-NEXT: v4 = vand(v5,v4) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: v19.uw = vlsr(v31.uw,r0) +; CHECK-NEXT: v14.uw = vlsr(v14.uw,r0) ; CHECK-NEXT: q3 = vcmp.eq(v29.w,v31.w) -; CHECK-NEXT: v31 = vmux(q0,v16,v15) -; CHECK-NEXT: q0 = vcmp.gt(v5.uw,v3.uw) +; CHECK-NEXT: v18 = vmux(q0,v8,v5) +; CHECK-NEXT: q0 = vcmp.gt(v5.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v20.uw = vlsr(v28.uw,r0) +; CHECK-NEXT: v19.uw = vlsr(v31.uw,r0) +; CHECK-NEXT: v26 = vmux(q1,v5,v2) +; CHECK-NEXT: v31 = vmux(q0,v8,v5) +; CHECK-NEXT: q0 = vcmp.gt(v16.uw,v3.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.uw = vlsr(v9.uw,r0) -; CHECK-NEXT: v19 = vmux(q3,v20,v19) -; CHECK-NEXT: q3 = vcmp.eq(v4.w,v15.w) +; CHECK-NEXT: v10 = vsplat(r5) +; CHECK-NEXT: v29.uw = vlsr(v22.uw,r0) +; CHECK-NEXT: v15.w = vsub(v24.w,v15.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.uw = vlsr(v22.uw,r0) -; CHECK-NEXT: v19 = vor(v31,v19) +; CHECK-NEXT: v20.uw = vlsr(v28.uw,r0) +; CHECK-NEXT: v14 = vmux(q2,v29,v14) +; CHECK-NEXT: q2 = vcmp.gt(v13.uw,v23.uw) +; CHECK-NEXT: v15.w = vadd(v15.w,v10.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v25.uw = vlsr(v30.uw,r2) -; CHECK-NEXT: v9 = vmux(q2,v29,v9) -; CHECK-NEXT: q2 = vcmp.gt(v11.uw,v23.uw) -; CHECK-NEXT: v29 = vmux(q3,v15,v2) +; CHECK-NEXT: v19 = vmux(q3,v20,v19) +; CHECK-NEXT: q3 = vcmp.eq(v4.w,v5.w) +; CHECK-NEXT: v27 = vmux(q2,v2,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.uw = vlsr(v3.uw,r2) -; CHECK-NEXT: v27 = vmux(q2,v2,v15) -; CHECK-NEXT: q2 = vcmp.gt(v8.uw,v30.uw) +; CHECK-NEXT: q2 = vcmp.gt(v17.uw,v30.uw) ; CHECK-NEXT: v28.w = vadd(v25.w,v26.w) +; CHECK-NEXT: v29 = vmux(q3,v5,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.uw = vlsr(v8.uw,r2) -; CHECK-NEXT: v31 = vmux(q2,v2,v15) -; CHECK-NEXT: v2 = vmux(q0,v2,v15) -; CHECK-NEXT: v30.w = vadd(v3.w,v29.w) +; CHECK-NEXT: v17.uw = vlsr(v17.uw,r2) +; CHECK-NEXT: v19 = vor(v31,v19) +; CHECK-NEXT: v31 = vmux(q2,v2,v5) +; CHECK-NEXT: v2 = vmux(q0,v2,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2) -; CHECK-NEXT: v2.w = vsub(v2.w,v10.w) -; CHECK-NEXT: q3 = vcmp.eq(v8.w,v25.w) -; CHECK-NEXT: v22.w = vsub(v31.w,v12.w) +; CHECK-NEXT: v24.uw = vlsr(v16.uw,r2) +; CHECK-NEXT: v30.w = vadd(v3.w,v29.w) +; CHECK-NEXT: v2.w = vsub(v2.w,v9.w) +; CHECK-NEXT: v11.w = vsub(v31.w,v11.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.uw = vlsr(v28.uw,r0) -; CHECK-NEXT: v4.w = vsub(v27.w,v14.w) -; CHECK-NEXT: v8.w = vadd(v22.w,v18.w) -; CHECK-NEXT: v2.w = vadd(v2.w,v18.w) +; CHECK-NEXT: v16.uw = vlsr(v28.uw,r0) +; CHECK-NEXT: q3 = vcmp.eq(v17.w,v25.w) +; CHECK-NEXT: v4.w = vsub(v27.w,v12.w) +; CHECK-NEXT: v2.w = vadd(v2.w,v10.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.uw = vlsr(v25.uw,r0) +; CHECK-NEXT: v13.uw = vlsr(v25.uw,r0) ; CHECK-NEXT: q0 = vcmp.eq(v24.w,v3.w) -; CHECK-NEXT: q2 = vcmp.gt(v15.w,v1.w) -; CHECK-NEXT: v4.w = vadd(v4.w,v18.w) +; CHECK-NEXT: v21.w = vadd(v11.w,v10.w) +; CHECK-NEXT: q2 = vcmp.gt(v5.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v23.uw = vlsr(v30.uw,r0) -; CHECK-NEXT: v5 = vmux(q3,v5,v11) -; CHECK-NEXT: q3 = vcmp.gt(v15.w,v0.w) -; CHECK-NEXT: v24 = vmux(q2,v16,v15) +; CHECK-NEXT: v22.uw = vlsr(v30.uw,r0) +; CHECK-NEXT: v23 = vmux(q3,v16,v13) +; CHECK-NEXT: q3 = vcmp.gt(v5.w,v0.w) +; CHECK-NEXT: v24 = vmux(q2,v8,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.uw = vlsr(v3.uw,r0) -; CHECK-NEXT: v25 = vmux(q3,v16,v15) -; CHECK-NEXT: v5 = vor(v24,v5) -; CHECK-NEXT: v9 = vor(v17,v9) +; CHECK-NEXT: v4.w = vadd(v4.w,v10.w) +; CHECK-NEXT: v8 = vmux(q3,v8,v5) +; CHECK-NEXT: v10 = vor(v24,v23) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.w = vasl(v8.w,r4) -; CHECK-NEXT: v3 = vmux(q0,v23,v3) -; CHECK-NEXT: q2 = vcmp.eq(v1.w,v15.w) -; CHECK-NEXT: q3 = vcmp.eq(v0.w,v15.w) +; CHECK-NEXT: v9.w = vasl(v21.w,r4) +; CHECK-NEXT: v3 = vmux(q0,v22,v3) +; CHECK-NEXT: v14 = vor(v18,v14) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.w = vasl(v2.w,r4) -; CHECK-NEXT: v3 = vor(v25,v3) -; CHECK-NEXT: v5 = vor(v5,v8) +; CHECK-NEXT: v3 = vor(v8,v3) +; CHECK-NEXT: v25 = vor(v10,v9) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v13.w = vasl(v13.w,r4) +; CHECK-NEXT: v15.w = vasl(v15.w,r4) ; CHECK-NEXT: v2 = vor(v3,v2) -; CHECK-NEXT: v27 = vmux(q2,v15,v5) +; CHECK-NEXT: v27 = vmux(q2,v5,v25) ; CHECK-NEXT: vmem(r1+#1) = v27.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v26.w = vasl(v4.w,r4) -; CHECK-NEXT: v29 = vmux(q3,v15,v2) -; CHECK-NEXT: q2 = vcmp.eq(v7.w,v15.w) +; CHECK-NEXT: v29 = vmux(q3,v5,v2) +; CHECK-NEXT: q2 = vcmp.eq(v7.w,v5.w) ; CHECK-NEXT: vmem(r1+#0) = v29.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28 = vor(v19,v26) -; CHECK-NEXT: v30 = vor(v9,v13) -; CHECK-NEXT: q3 = vcmp.eq(v6.w,v15.w) +; CHECK-NEXT: v30 = vor(v14,v15) +; CHECK-NEXT: q3 = vcmp.eq(v6.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vmux(q2,v15,v28) -; CHECK-NEXT: v31 = vmux(q3,v15,v30) +; CHECK-NEXT: v0 = vmux(q2,v5,v28) +; CHECK-NEXT: v31 = vmux(q3,v5,v30) ; CHECK-NEXT: vmem(r1+#3) = v0.new ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -740,25 +738,25 @@ define void @s16f16_1(ptr %a0, ptr %a1) #0 { ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: r7 = #1 -; CHECK-NEXT: r3:2 = combine(#31,#64) +; CHECK-NEXT: r3:2 = combine(#31,#1) +; CHECK-NEXT: r7 = #64 ; CHECK-NEXT: v1.h = vabs(v0.h) ; CHECK-NEXT: v0.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.h = vsplat(r7) +; CHECK-NEXT: v2.h = vsplat(r2) ; CHECK-NEXT: v5.h = vsplat(r3) ; CHECK-NEXT: r6 = #5 ; CHECK-NEXT: v3 = vxor(v3,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.h = vsplat(r2) +; CHECK-NEXT: v6.h = vsplat(r7) ; CHECK-NEXT: r4 = ##32768 ; CHECK-NEXT: v4.uh = vcl0(v1.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.h = vsplat(r4) -; CHECK-NEXT: r3 = #10 +; CHECK-NEXT: r4 = #10 ; CHECK-NEXT: q2 = vcmp.gt(v3.h,v0.h) ; CHECK-NEXT: v4.h = vadd(v4.h,v2.h) ; CHECK-NEXT: } @@ -789,15 +787,15 @@ define void @s16f16_1(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: q3 = vcmp.eq(v1.h,v25.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v27.uh = vlsr(v25.uh,r7) +; CHECK-NEXT: v27.uh = vlsr(v25.uh,r2) ; CHECK-NEXT: v28.h = vsub(v2.h,v4.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.uh = vlsr(v7.uh,r7) +; CHECK-NEXT: v29.uh = vlsr(v7.uh,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1.h = vasl(v28.h,r3) -; CHECK-NEXT: q3 = vsetq(r2) +; CHECK-NEXT: v1.h = vasl(v28.h,r4) +; CHECK-NEXT: q3 = vsetq(r7) ; CHECK-NEXT: v2 = vmux(q3,v29,v27) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -835,9 +833,9 @@ define void @s16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v1:0.w = vunpack(v0.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4 = vsplat(r0) +; CHECK-NEXT: v3 = vsplat(r0) ; CHECK-NEXT: r7 = #512 -; CHECK-NEXT: v3.w = vabs(v0.w) +; CHECK-NEXT: v4.w = vabs(v0.w) ; CHECK-NEXT: v6.w = vabs(v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -849,60 +847,60 @@ define void @s16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v13 = vsplat(r5) ; CHECK-NEXT: r6 = ##-2147483648 -; CHECK-NEXT: v7.uw = vcl0(v3.uw) +; CHECK-NEXT: v7.uw = vcl0(v4.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10 = vsplat(r6) ; CHECK-NEXT: v8.uw = vcl0(v6.uw) ; CHECK-NEXT: q0 = vcmp.gt(v2.w,v0.w) -; CHECK-NEXT: v7.w = vadd(v7.w,v4.w) +; CHECK-NEXT: v7.w = vadd(v7.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #23 -; CHECK-NEXT: v8.w = vadd(v8.w,v4.w) +; CHECK-NEXT: v8.w = vadd(v8.w,v3.w) ; CHECK-NEXT: v27 = vmux(q0,v10,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.w = vasl(v3.w,v7.w) +; CHECK-NEXT: v4.w = vasl(v4.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vasl(v6.w,v8.w) -; CHECK-NEXT: v11.w = vadd(v3.w,v5.w) -; CHECK-NEXT: v12 = vand(v3,v9) +; CHECK-NEXT: v11.w = vadd(v4.w,v5.w) +; CHECK-NEXT: v12 = vand(v4,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.w = vadd(v6.w,v5.w) ; CHECK-NEXT: v9 = vand(v6,v9) ; CHECK-NEXT: q1 = vcmp.eq(v12.w,v2.w) -; CHECK-NEXT: q2 = vcmp.gt(v3.uw,v11.uw) +; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v11.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v22.uw = vlsr(v11.uw,r2) ; CHECK-NEXT: q3 = vcmp.eq(v9.w,v2.w) -; CHECK-NEXT: v23 = vmux(q1,v2,v4) -; CHECK-NEXT: v14 = vmux(q2,v4,v2) +; CHECK-NEXT: v23 = vmux(q1,v2,v3) +; CHECK-NEXT: v14 = vmux(q2,v3,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2) ; CHECK-NEXT: v11.w = vadd(v22.w,v23.w) ; CHECK-NEXT: q2 = vcmp.gt(v6.uw,v5.uw) -; CHECK-NEXT: v25 = vmux(q3,v2,v4) +; CHECK-NEXT: v25 = vmux(q3,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v21.uw = vlsr(v3.uw,r2) +; CHECK-NEXT: v21.uw = vlsr(v4.uw,r2) ; CHECK-NEXT: v5.w = vadd(v24.w,v25.w) -; CHECK-NEXT: v4 = vmux(q2,v4,v2) +; CHECK-NEXT: v3 = vmux(q2,v3,v2) ; CHECK-NEXT: v7.w = vsub(v14.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2) -; CHECK-NEXT: v4.w = vsub(v4.w,v8.w) +; CHECK-NEXT: v3.w = vsub(v3.w,v8.w) ; CHECK-NEXT: q3 = vcmp.eq(v21.w,v22.w) ; CHECK-NEXT: v7.w = vadd(v7.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.uw = vlsr(v22.uw,r0) -; CHECK-NEXT: v4.w = vadd(v4.w,v13.w) +; CHECK-NEXT: v4.uw = vlsr(v22.uw,r0) +; CHECK-NEXT: v3.w = vadd(v3.w,v13.w) ; CHECK-NEXT: q2 = vcmp.eq(v6.w,v24.w) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -910,13 +908,13 @@ define void @s16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.uw = vlsr(v5.uw,r0) -; CHECK-NEXT: v3 = vmux(q3,v11,v3) +; CHECK-NEXT: v4 = vmux(q3,v11,v4) ; CHECK-NEXT: q3 = vcmp.gt(v2.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v26.uw = vlsr(v24.uw,r0) ; CHECK-NEXT: v28 = vmux(q3,v10,v2) -; CHECK-NEXT: v3 = vor(v27,v3) +; CHECK-NEXT: v4 = vor(v27,v4) ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -925,17 +923,17 @@ define void @s16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: q2 = vcmp.eq(v1.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = vasl(v4.w,r4) +; CHECK-NEXT: v3.w = vasl(v3.w,r4) ; CHECK-NEXT: v5 = vor(v28,v5) -; CHECK-NEXT: v29 = vor(v3,v7) +; CHECK-NEXT: v29 = vor(v4,v7) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4 = vor(v5,v4) +; CHECK-NEXT: v3 = vor(v5,v3) ; CHECK-NEXT: v31 = vmux(q3,v2,v29) ; CHECK-NEXT: vmem(r1+#0) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q2,v2,v4) +; CHECK-NEXT: v30 = vmux(q2,v2,v3) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#1) = v30.new ; CHECK-NEXT: } @@ -1044,114 +1042,114 @@ define void @s32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(#8,#1) ; CHECK-NEXT: r6 = #255 -; CHECK-NEXT: v3.w = vabs(v1.w) +; CHECK-NEXT: v6.w = vabs(v1.w) ; CHECK-NEXT: v1.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4 = vsplat(r2) +; CHECK-NEXT: v2 = vsplat(r2) ; CHECK-NEXT: r4 = #512 -; CHECK-NEXT: v2.w = vabs(v0.w) +; CHECK-NEXT: v5.w = vabs(v0.w) ; CHECK-NEXT: v0.cur = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9 = vsplat(r4) ; CHECK-NEXT: v8 = vsplat(r6) -; CHECK-NEXT: v6.uw = vcl0(v3.uw) -; CHECK-NEXT: v7 = vxor(v7,v7) +; CHECK-NEXT: v3.uw = vcl0(v6.uw) +; CHECK-NEXT: v20 = vxor(v20,v20) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #159 -; CHECK-NEXT: v5.uw = vcl0(v2.uw) -; CHECK-NEXT: v6.w = vadd(v6.w,v4.w) +; CHECK-NEXT: v4.uw = vcl0(v5.uw) +; CHECK-NEXT: v3.w = vadd(v3.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27 = vsplat(r4) ; CHECK-NEXT: r5 = ##-2147483648 -; CHECK-NEXT: v5.w = vadd(v5.w,v4.w) +; CHECK-NEXT: v7.w = vadd(v4.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v13 = vsplat(r5) -; CHECK-NEXT: v3.w = vasl(v3.w,v6.w) -; CHECK-NEXT: q0 = vcmp.gt(v7.w,v1.w) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: v2.w = vasl(v2.w,v5.w) -; CHECK-NEXT: v26 = vmux(q0,v13,v7) -; CHECK-NEXT: v10.w = vadd(v3.w,v8.w) -; CHECK-NEXT: v11 = vand(v3,v9) +; CHECK-NEXT: v6.w = vasl(v6.w,v3.w) +; CHECK-NEXT: q0 = vcmp.gt(v20.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9 = vand(v2,v9) -; CHECK-NEXT: q1 = vcmp.eq(v11.w,v7.w) -; CHECK-NEXT: v8.w = vadd(v2.w,v8.w) -; CHECK-NEXT: q2 = vcmp.gt(v3.uw,v10.uw) +; CHECK-NEXT: v5.w = vasl(v5.w,v7.w) +; CHECK-NEXT: v26 = vmux(q0,v13,v20) +; CHECK-NEXT: v10.w = vadd(v6.w,v8.w) +; CHECK-NEXT: v11 = vand(v6,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.uw = vlsr(v3.uw,r3) -; CHECK-NEXT: q3 = vcmp.eq(v9.w,v7.w) -; CHECK-NEXT: v22 = vmux(q1,v7,v4) -; CHECK-NEXT: q1 = vcmp.gt(v2.uw,v8.uw) +; CHECK-NEXT: v9 = vand(v5,v9) +; CHECK-NEXT: q3 = vcmp.eq(v11.w,v20.w) +; CHECK-NEXT: v8.w = vadd(v5.w,v8.w) +; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v10.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.uw = vlsr(v10.uw,r3) -; CHECK-NEXT: v24 = vmux(q3,v7,v4) -; CHECK-NEXT: v23 = vmux(q2,v4,v7) -; CHECK-NEXT: v4 = vmux(q1,v4,v7) +; CHECK-NEXT: v21.uw = vlsr(v10.uw,r3) +; CHECK-NEXT: q2 = vcmp.eq(v9.w,v20.w) +; CHECK-NEXT: v22 = vmux(q3,v20,v2) +; CHECK-NEXT: q3 = vcmp.gt(v5.uw,v8.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.uw = vlsr(v8.uw,r3) -; CHECK-NEXT: v9.w = vadd(v3.w,v22.w) -; CHECK-NEXT: v6.w = vsub(v23.w,v6.w) -; CHECK-NEXT: v4.w = vsub(v4.w,v5.w) +; CHECK-NEXT: v9.w = vadd(v21.w,v22.w) +; CHECK-NEXT: v24 = vmux(q2,v20,v2) +; CHECK-NEXT: v23 = vmux(q1,v2,v20) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.uw = vlsr(v2.uw,r3) +; CHECK-NEXT: v12.uw = vlsr(v6.uw,r3) +; CHECK-NEXT: v2 = vmux(q3,v2,v20) ; CHECK-NEXT: v25.w = vadd(v8.w,v24.w) -; CHECK-NEXT: q3 = vcmp.eq(v12.w,v3.w) -; CHECK-NEXT: v6.w = vadd(v6.w,v27.w) +; CHECK-NEXT: v3.w = vsub(v23.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.uw = vlsr(v5.uw,r3) +; CHECK-NEXT: v2.w = vsub(v2.w,v7.w) +; CHECK-NEXT: q3 = vcmp.eq(v12.w,v21.w) +; CHECK-NEXT: v3.w = vadd(v3.w,v27.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r3 = #23 -; CHECK-NEXT: v3.uw = vlsr(v3.uw,r2) -; CHECK-NEXT: q2 = vcmp.eq(v2.w,v8.w) -; CHECK-NEXT: v4.w = vadd(v4.w,v27.w) +; CHECK-NEXT: v6.uw = vlsr(v21.uw,r2) +; CHECK-NEXT: q2 = vcmp.eq(v5.w,v8.w) +; CHECK-NEXT: v2.w = vadd(v2.w,v27.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.uw = vlsr(v9.uw,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28.uw = vlsr(v25.uw,r2) -; CHECK-NEXT: v3 = vmux(q3,v9,v3) -; CHECK-NEXT: q3 = vcmp.gt(v7.w,v0.w) +; CHECK-NEXT: v6 = vmux(q3,v9,v6) +; CHECK-NEXT: q3 = vcmp.gt(v20.w,v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.uw = vlsr(v8.uw,r2) -; CHECK-NEXT: v30 = vmux(q3,v13,v7) -; CHECK-NEXT: v3 = vor(v26,v3) -; CHECK-NEXT: q3 = vcmp.eq(v0.w,v7.w) +; CHECK-NEXT: v29.uw = vlsr(v8.uw,r2) +; CHECK-NEXT: v30 = vmux(q3,v13,v20) +; CHECK-NEXT: v6 = vor(v26,v6) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v20.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.w = vasl(v6.w,r3) -; CHECK-NEXT: v2 = vmux(q2,v28,v2) -; CHECK-NEXT: q2 = vcmp.eq(v1.w,v7.w) +; CHECK-NEXT: v3.w = vasl(v3.w,r3) +; CHECK-NEXT: v5 = vmux(q2,v28,v29) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v20.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.w = vasl(v4.w,r3) -; CHECK-NEXT: v31 = vor(v30,v2) -; CHECK-NEXT: v3 = vor(v3,v29) +; CHECK-NEXT: v2.w = vasl(v2.w,r3) +; CHECK-NEXT: v31 = vor(v30,v5) +; CHECK-NEXT: v3 = vor(v6,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vor(v31,v2) -; CHECK-NEXT: v3 = vmux(q2,v7,v3) +; CHECK-NEXT: v3 = vmux(q2,v20,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vmux(q3,v7,v1) +; CHECK-NEXT: v0 = vmux(q3,v20,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.qf32 = vadd(v3.sf,v7.sf) +; CHECK-NEXT: v2.qf32 = vadd(v3.sf,v20.sf) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v7.sf) +; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v20.sf) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0.hf = v3:2.qf32 @@ -1454,60 +1452,60 @@ define void @u8f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v1:0.uh = vunpack(v0.ub) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.h = vsplat(r6) -; CHECK-NEXT: v5.h = vsplat(r3) +; CHECK-NEXT: v3.h = vsplat(r6) +; CHECK-NEXT: v4.h = vsplat(r3) ; CHECK-NEXT: r5 = #64 -; CHECK-NEXT: v3 = vxor(v3,v3) +; CHECK-NEXT: v2 = vxor(v2,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.h = vsplat(r5) ; CHECK-NEXT: r4 = #10 -; CHECK-NEXT: v4.uh = vcl0(v0.uh) +; CHECK-NEXT: v5.uh = vcl0(v0.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.uh = vcl0(v1.uh) -; CHECK-NEXT: v4.h = vadd(v4.h,v2.h) +; CHECK-NEXT: v5.h = vadd(v5.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.h = vadd(v7.h,v2.h) +; CHECK-NEXT: v7.h = vadd(v7.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.h = vasl(v0.h,v4.h) +; CHECK-NEXT: v8.h = vasl(v0.h,v5.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.h = vasl(v1.h,v7.h) ; CHECK-NEXT: v10 = vand(v8,v6) -; CHECK-NEXT: v9.h = vadd(v8.h,v5.h) +; CHECK-NEXT: v9.h = vadd(v8.h,v4.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v22.h = vadd(v11.h,v5.h) +; CHECK-NEXT: v22.h = vadd(v11.h,v4.h) ; CHECK-NEXT: v6 = vand(v11,v6) ; CHECK-NEXT: q0 = vcmp.gt(v8.uh,v9.uh) -; CHECK-NEXT: q1 = vcmp.eq(v10.h,v3.h) +; CHECK-NEXT: q1 = vcmp.eq(v10.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v21.uh = vlsr(v8.uh,r2) -; CHECK-NEXT: q2 = vcmp.eq(v6.h,v3.h) +; CHECK-NEXT: q2 = vcmp.eq(v6.h,v2.h) ; CHECK-NEXT: q3 = vcmp.gt(v11.uh,v22.uh) -; CHECK-NEXT: v12 = vmux(q1,v3,v2) +; CHECK-NEXT: v12 = vmux(q1,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.uh = vlsr(v9.uh,r2) -; CHECK-NEXT: v13 = vmux(q2,v3,v2) -; CHECK-NEXT: v25 = vmux(q0,v2,v3) -; CHECK-NEXT: v2 = vmux(q3,v2,v3) +; CHECK-NEXT: v13 = vmux(q2,v2,v3) +; CHECK-NEXT: v25 = vmux(q0,v3,v2) +; CHECK-NEXT: v3 = vmux(q3,v3,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.uh = vlsr(v22.uh,r2) ; CHECK-NEXT: v24.h = vadd(v9.h,v12.h) -; CHECK-NEXT: v2.h = vadd(v2.h,v5.h) -; CHECK-NEXT: v12.h = vadd(v25.h,v5.h) +; CHECK-NEXT: v3.h = vadd(v3.h,v4.h) +; CHECK-NEXT: v12.h = vadd(v25.h,v4.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v23.uh = vlsr(v11.uh,r2) ; CHECK-NEXT: v13.h = vadd(v8.h,v13.h) -; CHECK-NEXT: v4.h = vsub(v12.h,v4.h) -; CHECK-NEXT: v2.h = vsub(v2.h,v7.h) +; CHECK-NEXT: v5.h = vsub(v12.h,v5.h) +; CHECK-NEXT: v3.h = vsub(v3.h,v7.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v14.uh = vlsr(v9.uh,r6) @@ -1519,28 +1517,28 @@ define void @u8f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27.uh = vlsr(v13.uh,r6) -; CHECK-NEXT: v5 = vmux(q2,v26,v14) -; CHECK-NEXT: q2 = vcmp.eq(v1.h,v3.h) +; CHECK-NEXT: v4 = vmux(q2,v26,v14) +; CHECK-NEXT: q2 = vcmp.eq(v1.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28.uh = vlsr(v8.uh,r6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.h = vasl(v4.h,r4) +; CHECK-NEXT: v5.h = vasl(v5.h,r4) ; CHECK-NEXT: v6 = vmux(q3,v27,v28) -; CHECK-NEXT: q3 = vcmp.eq(v0.h,v3.h) +; CHECK-NEXT: q3 = vcmp.eq(v0.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.h = vasl(v2.h,r4) -; CHECK-NEXT: v29 = vor(v5,v4) +; CHECK-NEXT: v3.h = vasl(v3.h,r4) +; CHECK-NEXT: v29 = vor(v4,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vor(v6,v2) -; CHECK-NEXT: v31 = vmux(q3,v3,v29) +; CHECK-NEXT: v3 = vor(v6,v3) +; CHECK-NEXT: v31 = vmux(q3,v2,v29) ; CHECK-NEXT: vmem(r1+#0) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q2,v3,v2) +; CHECK-NEXT: v30 = vmux(q2,v2,v3) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#1) = v30.new ; CHECK-NEXT: } @@ -1635,6 +1633,7 @@ define void @u8f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r7 = #64 ; CHECK-NEXT: r0 = #1 +; CHECK-NEXT: r6 = #512 ; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -1643,19 +1642,16 @@ define void @u8f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v1 = valign(v0,v0,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: v15 = vsplat(r6) ; CHECK-NEXT: v6 = vsplat(r3) -; CHECK-NEXT: r6 = #512 +; CHECK-NEXT: r5 = #159 ; CHECK-NEXT: v3:2.uh = vunpack(v0.ub) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v13 = vsplat(r6) -; CHECK-NEXT: r5 = #159 +; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: v31:30.uh = vunpack(v1.ub) -; CHECK-NEXT: v15 = vxor(v15,v15) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v16 = vsplat(r5) -; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: v3:2.uw = vunpack(v2.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -1670,153 +1666,153 @@ define void @u8f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.uw = vcl0(v3.uw) -; CHECK-NEXT: v7.w = vadd(v7.w,v4.w) +; CHECK-NEXT: v11.w = vadd(v7.w,v4.w) +; CHECK-NEXT: v7 = vxor(v7,v7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.uw = vcl0(v1.uw) -; CHECK-NEXT: v8.w = vadd(v8.w,v4.w) +; CHECK-NEXT: v10.w = vadd(v8.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v10.w = vasl(v2.w,v5.w) -; CHECK-NEXT: v9.w = vadd(v9.w,v4.w) +; CHECK-NEXT: v9 = vsplat(r5) +; CHECK-NEXT: v14.w = vasl(v0.w,v11.w) +; CHECK-NEXT: v8.w = vadd(v9.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.w = vasl(v0.w,v7.w) -; CHECK-NEXT: v19 = vand(v10,v13) -; CHECK-NEXT: v18.w = vadd(v10.w,v6.w) +; CHECK-NEXT: v12.w = vasl(v2.w,v5.w) +; CHECK-NEXT: v24 = vand(v14,v15) +; CHECK-NEXT: v20.w = vadd(v14.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.w = vasl(v3.w,v8.w) -; CHECK-NEXT: v24 = vand(v12,v13) -; CHECK-NEXT: q2 = vcmp.eq(v19.w,v15.w) -; CHECK-NEXT: v20.w = vadd(v12.w,v6.w) +; CHECK-NEXT: v13.w = vasl(v3.w,v10.w) +; CHECK-NEXT: v19 = vand(v12,v15) +; CHECK-NEXT: q3 = vcmp.eq(v24.w,v7.w) +; CHECK-NEXT: v18.w = vadd(v12.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v14.w = vasl(v1.w,v9.w) -; CHECK-NEXT: v23 = vand(v11,v13) -; CHECK-NEXT: v22.w = vadd(v11.w,v6.w) -; CHECK-NEXT: q3 = vcmp.eq(v24.w,v15.w) +; CHECK-NEXT: v16.w = vasl(v1.w,v8.w) +; CHECK-NEXT: v23 = vand(v13,v15) +; CHECK-NEXT: v22.w = vadd(v13.w,v6.w) +; CHECK-NEXT: q0 = vcmp.gt(v14.uw,v20.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v19.uw = vlsr(v18.uw,r2) -; CHECK-NEXT: v6.w = vadd(v14.w,v6.w) -; CHECK-NEXT: v13 = vand(v14,v13) -; CHECK-NEXT: v31 = vmux(q3,v15,v4) +; CHECK-NEXT: v6.w = vadd(v16.w,v6.w) +; CHECK-NEXT: v15 = vand(v16,v15) +; CHECK-NEXT: v30 = vmux(q3,v7,v4) +; CHECK-NEXT: q2 = vcmp.eq(v19.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v21.uw = vlsr(v12.uw,r2) -; CHECK-NEXT: q3 = vcmp.eq(v13.w,v15.w) -; CHECK-NEXT: v28 = vmux(q2,v15,v4) -; CHECK-NEXT: q0 = vcmp.gt(v12.uw,v20.uw) +; CHECK-NEXT: v21.uw = vlsr(v14.uw,r2) +; CHECK-NEXT: q3 = vcmp.eq(v15.w,v7.w) +; CHECK-NEXT: v28 = vmux(q0,v4,v7) +; CHECK-NEXT: q1 = vcmp.eq(v23.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.uw = vlsr(v20.uw,r2) -; CHECK-NEXT: q1 = vcmp.eq(v23.w,v15.w) -; CHECK-NEXT: v26 = vmux(q3,v15,v4) -; CHECK-NEXT: v23.w = vadd(v19.w,v28.w) +; CHECK-NEXT: v14.uw = vlsr(v20.uw,r2) +; CHECK-NEXT: v26 = vmux(q3,v7,v4) +; CHECK-NEXT: v11.w = vsub(v28.w,v11.w) +; CHECK-NEXT: q3 = vcmp.gt(v13.uw,v22.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v13.uw = vlsr(v6.uw,r2) -; CHECK-NEXT: v20.w = vadd(v12.w,v31.w) -; CHECK-NEXT: q3 = vcmp.gt(v11.uw,v22.uw) -; CHECK-NEXT: v31 = vmux(q1,v15,v4) +; CHECK-NEXT: v15.uw = vlsr(v6.uw,r2) +; CHECK-NEXT: v20.w = vadd(v14.w,v30.w) +; CHECK-NEXT: v30 = vmux(q1,v7,v4) +; CHECK-NEXT: v31 = vmux(q2,v7,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v28.uw = vlsr(v22.uw,r2) -; CHECK-NEXT: v30.w = vadd(v13.w,v26.w) -; CHECK-NEXT: q1 = vcmp.gt(v10.uw,v18.uw) -; CHECK-NEXT: v29 = vmux(q0,v4,v15) +; CHECK-NEXT: v19.uw = vlsr(v18.uw,r2) +; CHECK-NEXT: v29.w = vadd(v15.w,v26.w) +; CHECK-NEXT: q1 = vcmp.gt(v12.uw,v18.uw) +; CHECK-NEXT: v11.w = vadd(v11.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v21.uw = vlsr(v23.uw,r0) -; CHECK-NEXT: q0 = vcmp.eq(v21.w,v12.w) -; CHECK-NEXT: v22.w = vadd(v28.w,v31.w) -; CHECK-NEXT: v23 = vmux(q3,v4,v15) +; CHECK-NEXT: v28.uw = vlsr(v22.uw,r2) +; CHECK-NEXT: v23.w = vadd(v19.w,v31.w) +; CHECK-NEXT: v22 = vmux(q3,v4,v7) +; CHECK-NEXT: q3 = vcmp.gt(v16.uw,v6.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v24.uw = vlsr(v30.uw,r0) -; CHECK-NEXT: v31 = vmux(q1,v4,v15) -; CHECK-NEXT: q3 = vcmp.gt(v14.uw,v6.uw) -; CHECK-NEXT: v30.w = vsub(v23.w,v8.w) +; CHECK-NEXT: v24.uw = vlsr(v29.uw,r0) +; CHECK-NEXT: v31.w = vadd(v28.w,v30.w) +; CHECK-NEXT: v30 = vmux(q1,v4,v7) +; CHECK-NEXT: v4 = vmux(q3,v4,v7) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vsub(v31.w,v5.w) -; CHECK-NEXT: v4 = vmux(q3,v4,v15) -; CHECK-NEXT: v7.w = vsub(v29.w,v7.w) -; CHECK-NEXT: v6.w = vadd(v30.w,v16.w) +; CHECK-NEXT: v17.uw = vlsr(v12.uw,r2) +; CHECK-NEXT: v5.w = vsub(v30.w,v5.w) +; CHECK-NEXT: v29.w = vsub(v22.w,v10.w) +; CHECK-NEXT: v4.w = vsub(v4.w,v8.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v17.uw = vlsr(v10.uw,r2) -; CHECK-NEXT: v4.w = vsub(v4.w,v9.w) -; CHECK-NEXT: v5.w = vadd(v5.w,v16.w) -; CHECK-NEXT: v7.w = vadd(v7.w,v16.w) +; CHECK-NEXT: v13.uw = vlsr(v13.uw,r2) +; CHECK-NEXT: v6.w = vadd(v29.w,v9.w) +; CHECK-NEXT: v5.w = vadd(v5.w,v9.w) +; CHECK-NEXT: q0 = vcmp.eq(v21.w,v14.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.uw = vlsr(v11.uw,r2) +; CHECK-NEXT: v25.uw = vlsr(v16.uw,r2) ; CHECK-NEXT: q2 = vcmp.eq(v17.w,v19.w) -; CHECK-NEXT: v4.w = vadd(v4.w,v16.w) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: v25.uw = vlsr(v14.uw,r2) -; CHECK-NEXT: q3 = vcmp.eq(v11.w,v28.w) +; CHECK-NEXT: q3 = vcmp.eq(v13.w,v28.w) +; CHECK-NEXT: v4.w = vadd(v4.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v27.uw = vlsr(v12.uw,r0) -; CHECK-NEXT: q1 = vcmp.eq(v25.w,v13.w) +; CHECK-NEXT: v21.uw = vlsr(v23.uw,r0) +; CHECK-NEXT: q1 = vcmp.eq(v25.w,v15.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.uw = vlsr(v19.uw,r0) +; CHECK-NEXT: v23.uw = vlsr(v19.uw,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.uw = vlsr(v22.uw,r0) -; CHECK-NEXT: v23 = vmux(q2,v21,v29) -; CHECK-NEXT: q2 = vcmp.eq(v3.w,v15.w) +; CHECK-NEXT: v31.uw = vlsr(v31.uw,r0) +; CHECK-NEXT: v23 = vmux(q2,v21,v23) +; CHECK-NEXT: q2 = vcmp.eq(v3.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v14.uw = vlsr(v28.uw,r0) +; CHECK-NEXT: v16.uw = vlsr(v28.uw,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v20.uw = vlsr(v20.uw,r0) -; CHECK-NEXT: v8 = vmux(q3,v12,v14) -; CHECK-NEXT: q3 = vcmp.eq(v2.w,v15.w) +; CHECK-NEXT: v26.uw = vlsr(v15.uw,r0) +; CHECK-NEXT: v8 = vmux(q3,v31,v16) +; CHECK-NEXT: q3 = vcmp.eq(v2.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vasl(v6.w,r4) -; CHECK-NEXT: v20 = vmux(q0,v20,v27) +; CHECK-NEXT: v22 = vmux(q1,v24,v26) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v26.uw = vlsr(v13.uw,r0) +; CHECK-NEXT: v5.w = vasl(v5.w,r4) ; CHECK-NEXT: v6 = vor(v8,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vasl(v5.w,r4) -; CHECK-NEXT: v22 = vmux(q1,v24,v26) -; CHECK-NEXT: v26 = vmux(q2,v15,v6) +; CHECK-NEXT: v27.uw = vlsr(v14.uw,r0) +; CHECK-NEXT: v25 = vor(v23,v5) +; CHECK-NEXT: v26 = vmux(q2,v7,v6) ; CHECK-NEXT: vmem(r1+#1) = v26.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.w = vasl(v7.w,r4) -; CHECK-NEXT: v25 = vor(v23,v5) -; CHECK-NEXT: q2 = vcmp.eq(v1.w,v15.w) +; CHECK-NEXT: v20.uw = vlsr(v20.uw,r0) +; CHECK-NEXT: v28 = vmux(q3,v7,v25) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v7.w) +; CHECK-NEXT: vmem(r1+#0) = v28.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v24.w = vasl(v4.w,r4) -; CHECK-NEXT: v28 = vmux(q3,v15,v25) -; CHECK-NEXT: v29 = vor(v20,v7) -; CHECK-NEXT: vmem(r1+#0) = v28.new +; CHECK-NEXT: v11.w = vasl(v11.w,r4) +; CHECK-NEXT: v20 = vmux(q0,v20,v27) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v27 = vor(v22,v24) -; CHECK-NEXT: q3 = vcmp.eq(v0.w,v15.w) +; CHECK-NEXT: v24.w = vasl(v4.w,r4) +; CHECK-NEXT: v29 = vor(v20,v11) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q2,v15,v27) -; CHECK-NEXT: v31 = vmux(q3,v15,v29) -; CHECK-NEXT: vmem(r1+#3) = v30.new +; CHECK-NEXT: v27 = vor(v22,v24) +; CHECK-NEXT: v31 = vmux(q3,v7,v29) +; CHECK-NEXT: vmem(r1+#2) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: v30 = vmux(q2,v7,v27) ; CHECK-NEXT: jumpr r31 -; CHECK-NEXT: vmem(r1+#2) = v31 +; CHECK-NEXT: vmem(r1+#3) = v30.new ; CHECK-NEXT: } %v0 = load <128 x i8>, ptr %a0, align 128 %v1 = uitofp <128 x i8> %v0 to <128 x float> @@ -1830,112 +1826,110 @@ define void @u8f32_1(ptr %a0, ptr %a1) #0 { ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vmem(r0+#0) -; CHECK-NEXT: } -; CHECK-NEXT: { ; CHECK-NEXT: r7 = #1 ; CHECK-NEXT: r6 = #512 -; CHECK-NEXT: v1:0.uh = vunpack(v0.ub) +; CHECK-NEXT: v3:2.uh = vunpack(v0.ub) +; CHECK-NEXT: v0.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vsplat(r7) +; CHECK-NEXT: v1 = vsplat(r7) ; CHECK-NEXT: v8 = vsplat(r6) ; CHECK-NEXT: r3:2 = combine(##255,#8) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6 = vsplat(r3) ; CHECK-NEXT: r5 = #159 -; CHECK-NEXT: v1:0.uw = vunpack(v0.uh) -; CHECK-NEXT: v3 = vxor(v3,v3) +; CHECK-NEXT: v3:2.uw = vunpack(v2.uh) +; CHECK-NEXT: v21 = vxor(v21,v21) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v13 = vsplat(r5) ; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.uw = vcl0(v0.uw) +; CHECK-NEXT: v4.uw = vcl0(v2.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.uw = vcl0(v1.uw) -; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) +; CHECK-NEXT: v5.uw = vcl0(v3.uw) +; CHECK-NEXT: v4.w = vadd(v4.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vadd(v5.w,v2.w) +; CHECK-NEXT: v5.w = vadd(v5.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.w = vasl(v0.w,v4.w) +; CHECK-NEXT: v7.w = vasl(v2.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.w = vasl(v1.w,v5.w) +; CHECK-NEXT: v9.w = vasl(v3.w,v5.w) ; CHECK-NEXT: v11 = vand(v7,v8) ; CHECK-NEXT: v10.w = vadd(v7.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vadd(v9.w,v6.w) -; CHECK-NEXT: q0 = vcmp.eq(v11.w,v3.w) +; CHECK-NEXT: q1 = vcmp.eq(v11.w,v21.w) ; CHECK-NEXT: v8 = vand(v9,v8) -; CHECK-NEXT: q1 = vcmp.gt(v7.uw,v10.uw) +; CHECK-NEXT: q0 = vcmp.gt(v7.uw,v10.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v19.uw = vlsr(v10.uw,r2) -; CHECK-NEXT: v21 = vmux(q0,v3,v2) -; CHECK-NEXT: q3 = vcmp.eq(v8.w,v3.w) -; CHECK-NEXT: q0 = vcmp.gt(v9.uw,v6.uw) +; CHECK-NEXT: v22.uw = vlsr(v10.uw,r2) +; CHECK-NEXT: v24 = vmux(q1,v21,v1) +; CHECK-NEXT: q3 = vcmp.eq(v8.w,v21.w) +; CHECK-NEXT: q1 = vcmp.gt(v9.uw,v6.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v20.uw = vlsr(v6.uw,r2) -; CHECK-NEXT: v22 = vmux(q1,v2,v3) -; CHECK-NEXT: v24 = vmux(q3,v3,v2) -; CHECK-NEXT: v2 = vmux(q0,v2,v3) +; CHECK-NEXT: v23.uw = vlsr(v6.uw,r2) +; CHECK-NEXT: v25 = vmux(q0,v1,v21) +; CHECK-NEXT: v27 = vmux(q3,v21,v1) +; CHECK-NEXT: v1 = vmux(q1,v1,v21) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = vsub(v22.w,v4.w) -; CHECK-NEXT: v2.w = vsub(v2.w,v5.w) -; CHECK-NEXT: v10.w = vadd(v19.w,v21.w) -; CHECK-NEXT: v25.w = vadd(v20.w,v24.w) +; CHECK-NEXT: v4.w = vsub(v25.w,v4.w) +; CHECK-NEXT: v1.w = vsub(v1.w,v5.w) +; CHECK-NEXT: v10.w = vadd(v22.w,v24.w) +; CHECK-NEXT: v28.w = vadd(v23.w,v27.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.uw = vlsr(v7.uw,r2) ; CHECK-NEXT: v4.w = vadd(v4.w,v13.w) -; CHECK-NEXT: v2.w = vadd(v2.w,v13.w) +; CHECK-NEXT: v1.w = vadd(v1.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v23.uw = vlsr(v9.uw,r2) -; CHECK-NEXT: q2 = vcmp.eq(v12.w,v19.w) +; CHECK-NEXT: v26.uw = vlsr(v9.uw,r2) +; CHECK-NEXT: q2 = vcmp.eq(v12.w,v22.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.uw = vlsr(v19.uw,r7) -; CHECK-NEXT: q3 = vcmp.eq(v23.w,v20.w) +; CHECK-NEXT: v11.uw = vlsr(v22.uw,r7) +; CHECK-NEXT: q3 = vcmp.eq(v26.w,v23.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v27.uw = vlsr(v10.uw,r7) +; CHECK-NEXT: v30.uw = vlsr(v10.uw,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v26.uw = vlsr(v20.uw,r7) -; CHECK-NEXT: v5 = vmux(q2,v27,v11) -; CHECK-NEXT: q2 = vcmp.eq(v1.w,v3.w) +; CHECK-NEXT: v29.uw = vlsr(v23.uw,r7) +; CHECK-NEXT: v5 = vmux(q2,v30,v11) +; CHECK-NEXT: q2 = vcmp.eq(v3.w,v21.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.uw = vlsr(v25.uw,r7) +; CHECK-NEXT: v6.uw = vlsr(v28.uw,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vasl(v4.w,r4) -; CHECK-NEXT: v6 = vmux(q3,v6,v26) -; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) +; CHECK-NEXT: v6 = vmux(q3,v6,v29) +; CHECK-NEXT: q3 = vcmp.eq(v2.w,v21.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.w = vasl(v2.w,r4) -; CHECK-NEXT: v29 = vor(v5,v4) +; CHECK-NEXT: v1.w = vasl(v1.w,r4) +; CHECK-NEXT: v31 = vor(v5,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v28 = vor(v6,v2) -; CHECK-NEXT: v31 = vmux(q3,v3,v29) -; CHECK-NEXT: vmem(r1+#0) = v31.new +; CHECK-NEXT: v1 = vor(v6,v1) +; CHECK-NEXT: v0 = vmux(q3,v21,v31) +; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q2,v3,v28) +; CHECK-NEXT: v1 = vmux(q2,v21,v1) ; CHECK-NEXT: jumpr r31 -; CHECK-NEXT: vmem(r1+#1) = v30.new +; CHECK-NEXT: vmem(r1+#1) = v1.new ; CHECK-NEXT: } %v0 = load <64 x i8>, ptr %a0, align 128 %v1 = uitofp <64 x i8> %v0 to <64 x float> @@ -2188,10 +2182,10 @@ define void @u16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v1:0.uw = vunpack(v0.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vsplat(r7) +; CHECK-NEXT: v3 = vsplat(r7) ; CHECK-NEXT: v6 = vsplat(r3) ; CHECK-NEXT: r6 = #512 -; CHECK-NEXT: v3 = vxor(v3,v3) +; CHECK-NEXT: v2 = vxor(v2,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8 = vsplat(r6) @@ -2202,10 +2196,10 @@ define void @u16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v14 = vsplat(r5) ; CHECK-NEXT: v5.uw = vcl0(v1.uw) -; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) +; CHECK-NEXT: v4.w = vadd(v4.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vadd(v5.w,v2.w) +; CHECK-NEXT: v5.w = vadd(v5.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.w = vasl(v0.w,v4.w) @@ -2218,31 +2212,31 @@ define void @u16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vadd(v9.w,v6.w) ; CHECK-NEXT: v8 = vand(v9,v8) -; CHECK-NEXT: q0 = vcmp.eq(v11.w,v3.w) -; CHECK-NEXT: q1 = vcmp.gt(v7.uw,v10.uw) +; CHECK-NEXT: q1 = vcmp.eq(v11.w,v2.w) +; CHECK-NEXT: q0 = vcmp.gt(v7.uw,v10.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v19.uw = vlsr(v10.uw,r2) -; CHECK-NEXT: q2 = vcmp.eq(v8.w,v3.w) +; CHECK-NEXT: q2 = vcmp.eq(v8.w,v2.w) ; CHECK-NEXT: q3 = vcmp.gt(v9.uw,v6.uw) -; CHECK-NEXT: v20 = vmux(q0,v3,v2) +; CHECK-NEXT: v20 = vmux(q1,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v21.uw = vlsr(v6.uw,r2) -; CHECK-NEXT: v22 = vmux(q2,v3,v2) -; CHECK-NEXT: v25 = vmux(q1,v2,v3) -; CHECK-NEXT: v2 = vmux(q3,v2,v3) +; CHECK-NEXT: v22 = vmux(q2,v2,v3) +; CHECK-NEXT: v25 = vmux(q0,v3,v2) +; CHECK-NEXT: v3 = vmux(q3,v3,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vsub(v25.w,v4.w) -; CHECK-NEXT: v2.w = vsub(v2.w,v5.w) +; CHECK-NEXT: v3.w = vsub(v3.w,v5.w) ; CHECK-NEXT: v23.w = vadd(v19.w,v20.w) ; CHECK-NEXT: v10.w = vadd(v21.w,v22.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.uw = vlsr(v7.uw,r2) ; CHECK-NEXT: v4.w = vadd(v4.w,v14.w) -; CHECK-NEXT: v2.w = vadd(v2.w,v14.w) +; CHECK-NEXT: v3.w = vadd(v3.w,v14.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v24.uw = vlsr(v9.uw,r2) @@ -2258,7 +2252,7 @@ define void @u16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v27.uw = vlsr(v10.uw,r7) ; CHECK-NEXT: v5 = vmux(q2,v26,v13) -; CHECK-NEXT: q2 = vcmp.eq(v1.w,v3.w) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28.uw = vlsr(v21.uw,r7) @@ -2266,19 +2260,19 @@ define void @u16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vasl(v4.w,r4) ; CHECK-NEXT: v6 = vmux(q3,v27,v28) -; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.w = vasl(v2.w,r4) +; CHECK-NEXT: v3.w = vasl(v3.w,r4) ; CHECK-NEXT: v29 = vor(v5,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vor(v6,v2) -; CHECK-NEXT: v31 = vmux(q3,v3,v29) +; CHECK-NEXT: v3 = vor(v6,v3) +; CHECK-NEXT: v31 = vmux(q3,v2,v29) ; CHECK-NEXT: vmem(r1+#0) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q2,v3,v2) +; CHECK-NEXT: v30 = vmux(q2,v2,v3) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#1) = v30.new ; CHECK-NEXT: } @@ -2375,20 +2369,20 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(#8,#1) ; CHECK-NEXT: r6 = #255 -; CHECK-NEXT: v1.uw = vcl0(v0.uw) +; CHECK-NEXT: v3.uw = vcl0(v0.uw) ; CHECK-NEXT: v0.cur = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4 = vsplat(r2) +; CHECK-NEXT: v2 = vsplat(r2) ; CHECK-NEXT: r4 = #512 -; CHECK-NEXT: v3.uw = vcl0(v2.uw) -; CHECK-NEXT: v2.cur = vmem(r0+#0) +; CHECK-NEXT: v4.uw = vcl0(v1.uw) +; CHECK-NEXT: v1.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7 = vsplat(r4) ; CHECK-NEXT: v6 = vsplat(r6) -; CHECK-NEXT: v3.w = vadd(v3.w,v4.w) -; CHECK-NEXT: v1.w = vadd(v1.w,v4.w) +; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) +; CHECK-NEXT: v3.w = vadd(v3.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #159 @@ -2396,57 +2390,57 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10 = vsplat(r4) -; CHECK-NEXT: v5.w = vasl(v2.w,v3.w) +; CHECK-NEXT: v5.w = vasl(v1.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.w = vasl(v0.w,v1.w) +; CHECK-NEXT: v8.w = vasl(v0.w,v3.w) ; CHECK-NEXT: v11.w = vadd(v5.w,v6.w) ; CHECK-NEXT: v13 = vand(v5,v7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vadd(v8.w,v6.w) ; CHECK-NEXT: v7 = vand(v8,v7) -; CHECK-NEXT: q0 = vcmp.gt(v5.uw,v11.uw) -; CHECK-NEXT: q1 = vcmp.eq(v13.w,v9.w) +; CHECK-NEXT: q1 = vcmp.gt(v5.uw,v11.uw) +; CHECK-NEXT: q2 = vcmp.eq(v13.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v28.uw = vlsr(v11.uw,r3) +; CHECK-NEXT: v27.uw = vlsr(v11.uw,r3) ; CHECK-NEXT: q3 = vcmp.gt(v8.uw,v6.uw) -; CHECK-NEXT: q2 = vcmp.eq(v7.w,v9.w) -; CHECK-NEXT: v30 = vmux(q0,v4,v9) +; CHECK-NEXT: q0 = vcmp.eq(v7.w,v9.w) +; CHECK-NEXT: v28 = vmux(q2,v9,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r3) -; CHECK-NEXT: v29 = vmux(q1,v9,v4) -; CHECK-NEXT: v31 = vmux(q3,v4,v9) -; CHECK-NEXT: v4 = vmux(q2,v9,v4) +; CHECK-NEXT: v29 = vmux(q1,v2,v9) +; CHECK-NEXT: v30 = vmux(q3,v2,v9) +; CHECK-NEXT: v2 = vmux(q0,v9,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vsub(v29.w,v4.w) +; CHECK-NEXT: v7.w = vadd(v27.w,v28.w) ; CHECK-NEXT: v3.w = vsub(v30.w,v3.w) -; CHECK-NEXT: v7.w = vadd(v28.w,v29.w) -; CHECK-NEXT: v1.w = vsub(v31.w,v1.w) -; CHECK-NEXT: v4.w = vadd(v6.w,v4.w) +; CHECK-NEXT: v2.w = vadd(v6.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.uw = vlsr(v5.uw,r3) +; CHECK-NEXT: v4.w = vadd(v4.w,v10.w) ; CHECK-NEXT: v3.w = vadd(v3.w,v10.w) -; CHECK-NEXT: v1.w = vadd(v1.w,v10.w) -; CHECK-NEXT: q2 = vcmp.eq(v2.w,v9.w) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r3 = #23 ; CHECK-NEXT: v14.uw = vlsr(v8.uw,r3) -; CHECK-NEXT: q3 = vcmp.eq(v12.w,v28.w) +; CHECK-NEXT: q3 = vcmp.eq(v12.w,v27.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.uw = vlsr(v28.uw,r2) +; CHECK-NEXT: v5.uw = vlsr(v27.uw,r2) ; CHECK-NEXT: q1 = vcmp.eq(v14.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.uw = vlsr(v7.uw,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.uw = vlsr(v4.uw,r2) +; CHECK-NEXT: v2.uw = vlsr(v2.uw,r2) ; CHECK-NEXT: v5 = vmux(q3,v7,v5) ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v9.w) ; CHECK-NEXT: } @@ -2454,16 +2448,16 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.w = vasl(v3.w,r3) -; CHECK-NEXT: v2 = vmux(q1,v4,v6) +; CHECK-NEXT: v4.w = vasl(v4.w,r3) +; CHECK-NEXT: v31 = vmux(q1,v2,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1.w = vasl(v1.w,r3) -; CHECK-NEXT: v3 = vor(v5,v3) +; CHECK-NEXT: v2.w = vasl(v3.w,r3) +; CHECK-NEXT: v4 = vor(v5,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vor(v2,v1) -; CHECK-NEXT: v3 = vmux(q2,v9,v3) +; CHECK-NEXT: v1 = vor(v31,v2) +; CHECK-NEXT: v3 = vmux(q2,v9,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q3,v9,v1) diff --git a/llvm/test/CodeGen/Hexagon/autohvx/isel-truncate.ll b/llvm/test/CodeGen/Hexagon/autohvx/isel-truncate.ll index 2384ca4f95ec4..6fa0585843f46 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/isel-truncate.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/isel-truncate.ll @@ -14,15 +14,15 @@ define void @fred(<16 x i32> %a0, <16 x i32> %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r1:0 = combine(#-1,#32) ; CHECK-NEXT: v2 = vxor(v2,v2) -; CHECK-NEXT: q0 = vcmp.eq(v0.w,v1.w) +; CHECK-NEXT: q1 = vcmp.eq(v0.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r7 = ##g0 -; CHECK-NEXT: q1 = vsetq(r0) -; CHECK-NEXT: v0 = vmux(q0,v0,v2) +; CHECK-NEXT: q0 = vsetq(r0) +; CHECK-NEXT: v0 = vmux(q1,v0,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vand(q1,r1) +; CHECK-NEXT: v30 = vand(q0,r1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0.h = vpacke(v0.w,v0.w) diff --git a/llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll b/llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll index 7a79c7f981ae9..c18672ba0a833 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll @@ -321,58 +321,58 @@ define <64 x i32> @f10(<32 x i32> %a0, <32 x i32> %a1) #0 { ; V60-NEXT: r0 = ##33686018 ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v3:2.uw = vmpy(v0.uh,v1.uh) +; V60-NEXT: v3:2 = vcombine(v0,v1) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: r2 = #16 +; V60-NEXT: v1:0.uw = vmpy(v0.uh,v1.uh) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v4 = vxor(v4,v4) +; V60-NEXT: r2 = #16 ; V60-NEXT: } ; V60-NEXT: { ; V60-NEXT: v5 = vsplat(r0) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: q1 = vcmp.gt(v4.w,v0.w) +; V60-NEXT: v4 = vxor(v4,v4) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: q0 = vcmp.gt(v4.w,v1.w) +; V60-NEXT: v6.uw = vlsr(v0.uw,r2) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v6.uw = vlsr(v2.uw,r2) +; V60-NEXT: q1 = vcmp.gt(v4.w,v3.w) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v30 = vmux(q1,v1,v4) +; V60-NEXT: q0 = vcmp.gt(v4.w,v2.w) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v5 = vdelta(v1,v5) +; V60-NEXT: v5 = vdelta(v2,v5) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: if (q0) v30.w += v0.w +; V60-NEXT: v2 = vmux(q1,v2,v4) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v9:8.uw = vmpy(v0.uh,v5.uh) +; V60-NEXT: if (q0) v2.w += v3.w ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v9:8.w = vadd(v9.uh,v8.uh) +; V60-NEXT: v9:8.uw = vmpy(v3.uh,v5.uh) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v29.w = vadd(v8.w,v6.w) +; V60-NEXT: v9:8.w = vadd(v9.uh,v8.uh) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v2.w += vasl(v8.w,r2) +; V60-NEXT: v31.w = vadd(v8.w,v6.w) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v9.w += vasr(v29.w,r2) +; V60-NEXT: v0.w += vasl(v8.w,r2) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v31.w = vadd(v3.w,v9.w) +; V60-NEXT: v9.w += vasr(v31.w,r2) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v3.w = vsub(v31.w,v30.w) +; V60-NEXT: v1.w = vadd(v1.w,v9.w) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v1:0 = vcombine(v3,v2) +; V60-NEXT: v1.w = vsub(v1.w,v2.w) ; V60-NEXT: } ; V60-NEXT: { ; V60-NEXT: jumpr r31 diff --git a/llvm/test/CodeGen/Hexagon/ifcvt-diamond-bug-2016-08-26.ll b/llvm/test/CodeGen/Hexagon/ifcvt-diamond-bug-2016-08-26.ll index be2b7b4d60107..00e9cf25a7044 100644 --- a/llvm/test/CodeGen/Hexagon/ifcvt-diamond-bug-2016-08-26.ll +++ b/llvm/test/CodeGen/Hexagon/ifcvt-diamond-bug-2016-08-26.ll @@ -16,7 +16,6 @@ entry: %cmp199 = icmp eq i16 %call197, 0 br i1 %cmp199, label %if.then200, label %if.else201 -; CHECK: = add ; CHECK-DAG: [[R4:r[0-9]+]] = add ; CHECK-DAG: p0 = cmp.eq(r0,#0) ; CHECK: if (!p0) [[R3:r[0-9]+]] = add(r{{[0-9]+}},#3) diff --git a/llvm/test/CodeGen/Hexagon/ntstbit.ll b/llvm/test/CodeGen/Hexagon/ntstbit.ll index afd71c217cefb..00a2dcfcf2f47 100644 --- a/llvm/test/CodeGen/Hexagon/ntstbit.ll +++ b/llvm/test/CodeGen/Hexagon/ntstbit.ll @@ -7,7 +7,7 @@ define i32 @f0(i32 %a0, i32 %a1, i32 %a2) #0 { ; CHECK: // %bb.0: // %b0 ; CHECK-NEXT: { ; CHECK-NEXT: p0 = !tstbit(r1,r2) -; CHECK-NEXT: r17:16 = combine(r0,r1) +; CHECK-NEXT: r17:16 = combine(r1,r0) ; CHECK-NEXT: memd(r29+#-16) = r17:16 ; CHECK-NEXT: allocframe(#8) ; CHECK-NEXT: } // 8-byte Folded Spill @@ -28,8 +28,8 @@ define i32 @f0(i32 %a0, i32 %a1, i32 %a2) #0 { ; CHECK-NEXT: .LBB0_3: // %b3 ; CHECK-NEXT: { ; CHECK-NEXT: call f3 -; CHECK-NEXT: r1 = add(r16,#2) -; CHECK-NEXT: r0 = r17 +; CHECK-NEXT: r1 = add(r17,#2) +; CHECK-NEXT: r0 = r16 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r0 = #0 diff --git a/llvm/test/CodeGen/Hexagon/signext-inreg.ll b/llvm/test/CodeGen/Hexagon/signext-inreg.ll index cd9d783586957..fe74fa0f9a0ee 100644 --- a/llvm/test/CodeGen/Hexagon/signext-inreg.ll +++ b/llvm/test/CodeGen/Hexagon/signext-inreg.ll @@ -141,34 +141,34 @@ define <64 x i16> @test3(<64 x i16> %m) { ; CHECK-NEXT: r11:10 = memd(r29+#88) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r13:12 = vaslh(r9:8,#8) +; CHECK-NEXT: r9:8 = vaslh(r9:8,#8) ; CHECK-NEXT: r11:10 = vaslh(r11:10,#8) -; CHECK-NEXT: r9:8 = memd(r29+#80) +; CHECK-NEXT: r13:12 = memd(r29+#80) ; CHECK-NEXT: r7:6 = memd(r29+#104) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r15:14 = vaslh(r7:6,#8) -; CHECK-NEXT: r9:8 = vaslh(r9:8,#8) +; CHECK-NEXT: r13:12 = vaslh(r13:12,#8) ; CHECK-NEXT: r7:6 = memd(r29+#72) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r15:14 = vasrh(r15:14,#8) -; CHECK-NEXT: r13:12 = vasrh(r13:12,#8) +; CHECK-NEXT: r9:8 = vasrh(r9:8,#8) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r11:10 = vasrh(r11:10,#8) -; CHECK-NEXT: r9:8 = vasrh(r9:8,#8) +; CHECK-NEXT: r13:12 = vasrh(r13:12,#8) ; CHECK-NEXT: r15:14 = memd(r29+#64) ; CHECK-NEXT: memd(r0+#120) = r15:14 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r7:6 = vaslh(r7:6,#8) ; CHECK-NEXT: r15:14 = vaslh(r15:14,#8) -; CHECK-NEXT: r13:12 = memd(r29+#56) -; CHECK-NEXT: memd(r0+#112) = r13:12 +; CHECK-NEXT: r9:8 = memd(r29+#56) +; CHECK-NEXT: memd(r0+#112) = r9:8 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r13:12 = vaslh(r13:12,#8) +; CHECK-NEXT: r9:8 = vaslh(r9:8,#8) ; CHECK-NEXT: r7:6 = vasrh(r7:6,#8) ; CHECK-NEXT: r11:10 = memd(r29+#48) ; CHECK-NEXT: memd(r0+#104) = r11:10 @@ -176,29 +176,29 @@ define <64 x i16> @test3(<64 x i16> %m) { ; CHECK-NEXT: { ; CHECK-NEXT: r11:10 = vaslh(r11:10,#8) ; CHECK-NEXT: r15:14 = vasrh(r15:14,#8) -; CHECK-NEXT: r9:8 = memd(r29+#40) -; CHECK-NEXT: memd(r0+#96) = r9:8 +; CHECK-NEXT: r13:12 = memd(r29+#40) +; CHECK-NEXT: memd(r0+#96) = r13:12 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r9:8 = vaslh(r9:8,#8) -; CHECK-NEXT: r13:12 = vasrh(r13:12,#8) +; CHECK-NEXT: r13:12 = vaslh(r13:12,#8) +; CHECK-NEXT: r9:8 = vasrh(r9:8,#8) ; CHECK-NEXT: r7:6 = memd(r29+#32) ; CHECK-NEXT: memd(r0+#88) = r7:6 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r11:10 = vasrh(r11:10,#8) -; CHECK-NEXT: r9:8 = vasrh(r9:8,#8) +; CHECK-NEXT: r13:12 = vasrh(r13:12,#8) ; CHECK-NEXT: r15:14 = memd(r29+#0) ; CHECK-NEXT: memd(r0+#80) = r15:14 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r7:6 = vaslh(r7:6,#8) ; CHECK-NEXT: r15:14 = vaslh(r15:14,#8) -; CHECK-NEXT: r13:12 = memd(r29+#16) -; CHECK-NEXT: memd(r0+#72) = r13:12 +; CHECK-NEXT: r9:8 = memd(r29+#16) +; CHECK-NEXT: memd(r0+#72) = r9:8 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r13:12 = vaslh(r13:12,#8) +; CHECK-NEXT: r9:8 = vaslh(r9:8,#8) ; CHECK-NEXT: r7:6 = vasrh(r7:6,#8) ; CHECK-NEXT: r11:10 = memd(r29+#24) ; CHECK-NEXT: memd(r0+#64) = r11:10 @@ -206,29 +206,29 @@ define <64 x i16> @test3(<64 x i16> %m) { ; CHECK-NEXT: { ; CHECK-NEXT: r11:10 = vaslh(r11:10,#8) ; CHECK-NEXT: r3:2 = vasrh(r3:2,#8) -; CHECK-NEXT: r9:8 = memd(r29+#8) -; CHECK-NEXT: memd(r0+#56) = r9:8 +; CHECK-NEXT: r13:12 = memd(r29+#8) +; CHECK-NEXT: memd(r0+#56) = r13:12 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r9:8 = vaslh(r9:8,#8) -; CHECK-NEXT: r13:12 = vasrh(r13:12,#8) +; CHECK-NEXT: r13:12 = vaslh(r13:12,#8) +; CHECK-NEXT: r9:8 = vasrh(r9:8,#8) ; CHECK-NEXT: memd(r0+#48) = r7:6 ; CHECK-NEXT: memd(r0+#0) = r3:2 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r11:10 = vasrh(r11:10,#8) ; CHECK-NEXT: r7:6 = vasrh(r15:14,#8) -; CHECK-NEXT: memd(r0+#32) = r13:12 +; CHECK-NEXT: memd(r0+#32) = r9:8 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r9:8 = vasrh(r9:8,#8) +; CHECK-NEXT: r13:12 = vasrh(r13:12,#8) ; CHECK-NEXT: r5:4 = vasrh(r5:4,#8) ; CHECK-NEXT: memd(r0+#40) = r11:10 ; CHECK-NEXT: memd(r0+#16) = r7:6 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: jumpr r31 -; CHECK-NEXT: memd(r0+#24) = r9:8 +; CHECK-NEXT: memd(r0+#24) = r13:12 ; CHECK-NEXT: memd(r0+#8) = r5:4 ; CHECK-NEXT: } ; diff --git a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll index 91b9ff36d29ab..1562f1872ceb7 100644 --- a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll +++ b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; This version of the conv3x3 test has both loops. This test checks that the -; inner loop has 13 packets. +; inner loop has 14 packets. ; CHECK: loop0(.LBB0_[[LOOP:.]], ; CHECK: .LBB0_[[LOOP]]: @@ -17,6 +17,7 @@ ; CHECK: } ; CHECK: } ; CHECK: } +; CHECK: } ; CHECK-NOT: } ; CHECK: }{{[ \t]*}}:endloop0 diff --git a/llvm/test/CodeGen/Hexagon/swp-stages4.ll b/llvm/test/CodeGen/Hexagon/swp-stages4.ll index ea88b79de9369..5377dc4d13abd 100644 --- a/llvm/test/CodeGen/Hexagon/swp-stages4.ll +++ b/llvm/test/CodeGen/Hexagon/swp-stages4.ll @@ -6,6 +6,7 @@ ; CHECK: = and ; CHECK: = and ; CHECK: r[[REGA:[0-9]+]] = memub(r{{[0-9]+}}+#1) +; CHECK: = and ; CHECK: r[[REG0:[0-9]+]] = and(r[[REG1:[0-9]+]],#255) ; CHECK-NOT: r[[REG0]] = and(r[[REG1]],#255) ; CHECK: loop0(.LBB0_[[LOOP:.]], diff --git a/llvm/test/CodeGen/PowerPC/all-atomics.ll b/llvm/test/CodeGen/PowerPC/all-atomics.ll index c71a6908fae5c..093253bf8f691 100644 --- a/llvm/test/CodeGen/PowerPC/all-atomics.ll +++ b/llvm/test/CodeGen/PowerPC/all-atomics.ll @@ -4688,13 +4688,13 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: lwsync ; AIX32-NEXT: lbz 4, 0(29) ; AIX32-NEXT: rlwinm 20, 29, 0, 0, 29 -; AIX32-NEXT: xori 24, 5, 24 -; AIX32-NEXT: slw 5, 3, 24 +; AIX32-NEXT: xori 25, 5, 24 +; AIX32-NEXT: slw 5, 3, 25 ; AIX32-NEXT: stb 3, 0(28) ; AIX32-NEXT: li 3, 255 ; AIX32-NEXT: sync -; AIX32-NEXT: slw 6, 4, 24 -; AIX32-NEXT: slw 3, 3, 24 +; AIX32-NEXT: slw 6, 4, 25 +; AIX32-NEXT: slw 3, 3, 25 ; AIX32-NEXT: and 4, 5, 3 ; AIX32-NEXT: and 5, 6, 3 ; AIX32-NEXT: L..BB3_4: # %entry @@ -4711,7 +4711,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: bne 0, L..BB3_4 ; AIX32-NEXT: L..BB3_6: # %entry ; AIX32-NEXT: lwsync -; AIX32-NEXT: srw 4, 6, 24 +; AIX32-NEXT: srw 4, 6, 25 ; AIX32-NEXT: lbz 3, 0(28) ; AIX32-NEXT: extsb 5, 3 ; AIX32-NEXT: lwz 3, L..C2(2) # @ss @@ -4750,12 +4750,12 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: lwz 3, L..C3(2) # @us ; AIX32-NEXT: rlwinm 6, 3, 3, 27, 27 ; AIX32-NEXT: rlwinm 19, 3, 0, 0, 29 -; AIX32-NEXT: xori 23, 6, 16 -; AIX32-NEXT: slw 6, 4, 23 +; AIX32-NEXT: xori 24, 6, 16 +; AIX32-NEXT: slw 6, 4, 24 ; AIX32-NEXT: li 4, 0 -; AIX32-NEXT: slw 5, 5, 23 +; AIX32-NEXT: slw 5, 5, 24 ; AIX32-NEXT: ori 4, 4, 65535 -; AIX32-NEXT: slw 4, 4, 23 +; AIX32-NEXT: slw 4, 4, 24 ; AIX32-NEXT: and 5, 5, 4 ; AIX32-NEXT: and 6, 6, 4 ; AIX32-NEXT: L..BB3_10: # %entry @@ -4771,7 +4771,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: stwcx. 8, 0, 19 ; AIX32-NEXT: bne 0, L..BB3_10 ; AIX32-NEXT: L..BB3_12: # %entry -; AIX32-NEXT: srw 4, 7, 23 +; AIX32-NEXT: srw 4, 7, 24 ; AIX32-NEXT: lwsync ; AIX32-NEXT: lwz 17, L..C4(2) # @si ; AIX32-NEXT: sth 4, 0(3) @@ -4810,11 +4810,11 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: lwz 31, L..C6(2) # @sll ; AIX32-NEXT: stw 5, 0(27) ; AIX32-NEXT: lbz 3, 0(28) -; AIX32-NEXT: li 25, 0 +; AIX32-NEXT: li 23, 0 ; AIX32-NEXT: addi 4, 1, 56 ; AIX32-NEXT: li 7, 5 ; AIX32-NEXT: li 8, 5 -; AIX32-NEXT: stw 25, 56(1) +; AIX32-NEXT: stw 23, 56(1) ; AIX32-NEXT: extsb 6, 3 ; AIX32-NEXT: lbz 3, 0(29) ; AIX32-NEXT: srawi 5, 6, 31 @@ -4832,7 +4832,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: extsb 6, 4 ; AIX32-NEXT: addi 4, 1, 56 ; AIX32-NEXT: srawi 5, 6, 31 -; AIX32-NEXT: stw 25, 56(1) +; AIX32-NEXT: stw 23, 56(1) ; AIX32-NEXT: stw 3, 0(31) ; AIX32-NEXT: lbz 3, 0(29) ; AIX32-NEXT: stw 3, 60(1) @@ -4870,14 +4870,14 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: lbz 5, 0(28) ; AIX32-NEXT: cmpw 4, 3 ; AIX32-NEXT: li 3, 1 -; AIX32-NEXT: iseleq 4, 3, 25 -; AIX32-NEXT: slw 6, 5, 24 +; AIX32-NEXT: iseleq 4, 3, 23 +; AIX32-NEXT: slw 6, 5, 25 ; AIX32-NEXT: li 5, 255 ; AIX32-NEXT: stw 4, 0(27) ; AIX32-NEXT: lbz 4, 0(29) -; AIX32-NEXT: slw 5, 5, 24 +; AIX32-NEXT: slw 5, 5, 25 ; AIX32-NEXT: sync -; AIX32-NEXT: slw 7, 4, 24 +; AIX32-NEXT: slw 7, 4, 25 ; AIX32-NEXT: and 6, 6, 5 ; AIX32-NEXT: and 7, 7, 5 ; AIX32-NEXT: L..BB3_22: # %entry @@ -4893,11 +4893,11 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: stwcx. 9, 0, 20 ; AIX32-NEXT: bne 0, L..BB3_22 ; AIX32-NEXT: L..BB3_24: # %entry -; AIX32-NEXT: srw 5, 8, 24 +; AIX32-NEXT: srw 5, 8, 25 ; AIX32-NEXT: lwsync ; AIX32-NEXT: cmpw 5, 4 ; AIX32-NEXT: lbz 5, 0(28) -; AIX32-NEXT: iseleq 4, 3, 25 +; AIX32-NEXT: iseleq 4, 3, 23 ; AIX32-NEXT: extsb 5, 5 ; AIX32-NEXT: stw 4, 0(27) ; AIX32-NEXT: lbz 4, 0(29) @@ -4926,16 +4926,16 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: lwsync ; AIX32-NEXT: cmpw 5, 4 ; AIX32-NEXT: lbz 5, 0(28) -; AIX32-NEXT: iseleq 4, 3, 25 +; AIX32-NEXT: iseleq 4, 3, 23 ; AIX32-NEXT: extsb 5, 5 ; AIX32-NEXT: stw 4, 0(27) ; AIX32-NEXT: lbz 4, 0(29) ; AIX32-NEXT: sync -; AIX32-NEXT: slw 6, 5, 23 +; AIX32-NEXT: slw 6, 5, 24 ; AIX32-NEXT: li 5, 0 -; AIX32-NEXT: slw 7, 4, 23 +; AIX32-NEXT: slw 7, 4, 24 ; AIX32-NEXT: ori 5, 5, 65535 -; AIX32-NEXT: slw 5, 5, 23 +; AIX32-NEXT: slw 5, 5, 24 ; AIX32-NEXT: and 6, 6, 5 ; AIX32-NEXT: and 7, 7, 5 ; AIX32-NEXT: L..BB3_28: # %entry @@ -4951,11 +4951,11 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: stwcx. 9, 0, 19 ; AIX32-NEXT: bne 0, L..BB3_28 ; AIX32-NEXT: L..BB3_30: # %entry -; AIX32-NEXT: srw 5, 8, 23 +; AIX32-NEXT: srw 5, 8, 24 ; AIX32-NEXT: lwsync ; AIX32-NEXT: cmpw 5, 4 ; AIX32-NEXT: lbz 5, 0(28) -; AIX32-NEXT: iseleq 4, 3, 25 +; AIX32-NEXT: iseleq 4, 3, 23 ; AIX32-NEXT: stw 4, 0(27) ; AIX32-NEXT: lbz 4, 0(29) ; AIX32-NEXT: sync @@ -4971,7 +4971,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: bne 0, L..BB3_31 ; AIX32-NEXT: L..BB3_33: # %entry ; AIX32-NEXT: lwsync -; AIX32-NEXT: isel 4, 3, 25, 6 +; AIX32-NEXT: isel 4, 3, 23, 6 ; AIX32-NEXT: lbz 5, 0(28) ; AIX32-NEXT: stw 4, 0(27) ; AIX32-NEXT: lbz 4, 0(29) @@ -4988,13 +4988,13 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: bne 0, L..BB3_34 ; AIX32-NEXT: L..BB3_36: # %entry ; AIX32-NEXT: lwsync -; AIX32-NEXT: isel 3, 3, 25, 6 +; AIX32-NEXT: isel 3, 3, 23, 6 ; AIX32-NEXT: li 7, 5 ; AIX32-NEXT: li 8, 5 ; AIX32-NEXT: lbz 4, 0(28) ; AIX32-NEXT: stw 3, 0(27) ; AIX32-NEXT: lbz 3, 0(29) -; AIX32-NEXT: stw 25, 56(1) +; AIX32-NEXT: stw 23, 56(1) ; AIX32-NEXT: extsb 6, 4 ; AIX32-NEXT: addi 4, 1, 56 ; AIX32-NEXT: stw 3, 60(1) @@ -5011,7 +5011,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: addi 4, 1, 56 ; AIX32-NEXT: stw 3, 60(1) ; AIX32-NEXT: mr 3, 30 -; AIX32-NEXT: stw 25, 56(1) +; AIX32-NEXT: stw 23, 56(1) ; AIX32-NEXT: srawi 5, 6, 31 ; AIX32-NEXT: bl .__atomic_compare_exchange_8[PR] ; AIX32-NEXT: nop diff --git a/llvm/test/CodeGen/PowerPC/atomics.ll b/llvm/test/CodeGen/PowerPC/atomics.ll index 57992cff28c62..23ff5f6926916 100644 --- a/llvm/test/CodeGen/PowerPC/atomics.ll +++ b/llvm/test/CodeGen/PowerPC/atomics.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc-unknown-linux-gnu -verify-machineinstrs -ppc-asm-full-reg-names | FileCheck %s --check-prefix=CHECK --check-prefix=PPC32 ; This is already checked for in Atomics-64.ll ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -ppc-asm-full-reg-names | FileCheck %s --check-prefix=CHECK --check-prefix=PPC64 @@ -368,48 +368,48 @@ define i8 @add_i8_monotonic(ptr %mem, i8 %operand) { define i16 @xor_i16_seq_cst(ptr %mem, i16 %operand) { ; PPC32-LABEL: xor_i16_seq_cst: ; PPC32: # %bb.0: -; PPC32-NEXT: li r6, 0 -; PPC32-NEXT: rlwinm r7, r3, 3, 27, 27 -; PPC32-NEXT: rlwinm r5, r3, 0, 0, 29 -; PPC32-NEXT: ori r6, r6, 65535 -; PPC32-NEXT: xori r3, r7, 16 -; PPC32-NEXT: slw r4, r4, r3 -; PPC32-NEXT: slw r6, r6, r3 +; PPC32-NEXT: li r5, 0 +; PPC32-NEXT: rlwinm r6, r3, 3, 27, 27 +; PPC32-NEXT: ori r7, r5, 65535 +; PPC32-NEXT: xori r5, r6, 16 +; PPC32-NEXT: rlwinm r3, r3, 0, 0, 29 +; PPC32-NEXT: slw r4, r4, r5 +; PPC32-NEXT: slw r6, r7, r5 ; PPC32-NEXT: sync ; PPC32-NEXT: .LBB13_1: -; PPC32-NEXT: lwarx r7, 0, r5 +; PPC32-NEXT: lwarx r7, 0, r3 ; PPC32-NEXT: xor r8, r4, r7 ; PPC32-NEXT: andc r9, r7, r6 ; PPC32-NEXT: and r8, r8, r6 ; PPC32-NEXT: or r8, r8, r9 -; PPC32-NEXT: stwcx. r8, 0, r5 +; PPC32-NEXT: stwcx. r8, 0, r3 ; PPC32-NEXT: bne cr0, .LBB13_1 ; PPC32-NEXT: # %bb.2: -; PPC32-NEXT: srw r3, r7, r3 +; PPC32-NEXT: srw r3, r7, r5 ; PPC32-NEXT: clrlwi r3, r3, 16 ; PPC32-NEXT: lwsync ; PPC32-NEXT: blr ; ; PPC64-LABEL: xor_i16_seq_cst: ; PPC64: # %bb.0: -; PPC64-NEXT: li r6, 0 -; PPC64-NEXT: rlwinm r7, r3, 3, 27, 27 -; PPC64-NEXT: rldicr r5, r3, 0, 61 -; PPC64-NEXT: ori r6, r6, 65535 -; PPC64-NEXT: xori r3, r7, 16 -; PPC64-NEXT: slw r4, r4, r3 -; PPC64-NEXT: slw r6, r6, r3 +; PPC64-NEXT: li r5, 0 +; PPC64-NEXT: rlwinm r6, r3, 3, 27, 27 +; PPC64-NEXT: ori r7, r5, 65535 +; PPC64-NEXT: xori r5, r6, 16 +; PPC64-NEXT: rldicr r3, r3, 0, 61 +; PPC64-NEXT: slw r4, r4, r5 +; PPC64-NEXT: slw r6, r7, r5 ; PPC64-NEXT: sync ; PPC64-NEXT: .LBB13_1: -; PPC64-NEXT: lwarx r7, 0, r5 +; PPC64-NEXT: lwarx r7, 0, r3 ; PPC64-NEXT: xor r8, r4, r7 ; PPC64-NEXT: andc r9, r7, r6 ; PPC64-NEXT: and r8, r8, r6 ; PPC64-NEXT: or r8, r8, r9 -; PPC64-NEXT: stwcx. r8, 0, r5 +; PPC64-NEXT: stwcx. r8, 0, r3 ; PPC64-NEXT: bne cr0, .LBB13_1 ; PPC64-NEXT: # %bb.2: -; PPC64-NEXT: srw r3, r7, r3 +; PPC64-NEXT: srw r3, r7, r5 ; PPC64-NEXT: clrlwi r3, r3, 16 ; PPC64-NEXT: lwsync ; PPC64-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/inc-of-add.ll b/llvm/test/CodeGen/PowerPC/inc-of-add.ll index 0b06d7ed586bf..c6d6f6a17b1b5 100644 --- a/llvm/test/CodeGen/PowerPC/inc-of-add.ll +++ b/llvm/test/CodeGen/PowerPC/inc-of-add.ll @@ -66,88 +66,89 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; PPC32: # %bb.0: ; PPC32-NEXT: stwu 1, -64(1) ; PPC32-NEXT: stw 21, 20(1) # 4-byte Folded Spill -; PPC32-NEXT: lbz 21, 123(1) ; PPC32-NEXT: stw 22, 24(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill -; PPC32-NEXT: add 7, 21, 7 -; PPC32-NEXT: lbz 23, 115(1) +; PPC32-NEXT: lbz 4, 115(1) ; PPC32-NEXT: lbz 22, 119(1) -; PPC32-NEXT: lbz 21, 135(1) -; PPC32-NEXT: add 5, 23, 5 -; PPC32-NEXT: lbz 23, 127(1) -; PPC32-NEXT: add 6, 22, 6 +; PPC32-NEXT: lbz 21, 123(1) +; PPC32-NEXT: add 4, 4, 5 +; PPC32-NEXT: add 5, 22, 6 ; PPC32-NEXT: lbz 22, 131(1) -; PPC32-NEXT: add 10, 21, 10 -; PPC32-NEXT: stw 26, 40(1) # 4-byte Folded Spill -; PPC32-NEXT: add 8, 23, 8 -; PPC32-NEXT: lbz 26, 83(1) +; PPC32-NEXT: add 6, 21, 7 +; PPC32-NEXT: lbz 21, 135(1) +; PPC32-NEXT: addi 6, 6, 1 +; PPC32-NEXT: stw 20, 16(1) # 4-byte Folded Spill ; PPC32-NEXT: add 9, 22, 9 +; PPC32-NEXT: lbz 20, 127(1) +; PPC32-NEXT: add 10, 21, 10 +; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill +; PPC32-NEXT: addi 5, 5, 1 +; PPC32-NEXT: lbz 25, 83(1) +; PPC32-NEXT: add 7, 20, 8 ; PPC32-NEXT: lbz 21, 147(1) +; PPC32-NEXT: addi 7, 7, 1 ; PPC32-NEXT: stw 24, 32(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill -; PPC32-NEXT: add 26, 21, 26 -; PPC32-NEXT: lbz 25, 79(1) -; PPC32-NEXT: lbz 24, 75(1) -; PPC32-NEXT: lbz 23, 139(1) +; PPC32-NEXT: addi 4, 4, 1 +; PPC32-NEXT: lbz 24, 79(1) +; PPC32-NEXT: add 25, 21, 25 ; PPC32-NEXT: lbz 22, 143(1) -; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill -; PPC32-NEXT: add 24, 23, 24 -; PPC32-NEXT: lbz 29, 95(1) -; PPC32-NEXT: add 25, 22, 25 +; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill +; PPC32-NEXT: lbz 23, 75(1) +; PPC32-NEXT: add 24, 22, 24 +; PPC32-NEXT: lbz 8, 139(1) +; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill +; PPC32-NEXT: lbz 28, 95(1) +; PPC32-NEXT: add 8, 8, 23 ; PPC32-NEXT: lbz 21, 159(1) +; PPC32-NEXT: addi 8, 8, 1 ; PPC32-NEXT: stw 27, 44(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill -; PPC32-NEXT: add 29, 21, 29 -; PPC32-NEXT: lbz 28, 91(1) -; PPC32-NEXT: lbz 27, 87(1) -; PPC32-NEXT: lbz 23, 151(1) +; PPC32-NEXT: lbz 27, 91(1) +; PPC32-NEXT: add 28, 21, 28 ; PPC32-NEXT: lbz 22, 155(1) -; PPC32-NEXT: lbz 4, 111(1) -; PPC32-NEXT: add 27, 23, 27 +; PPC32-NEXT: stw 26, 40(1) # 4-byte Folded Spill +; PPC32-NEXT: lbz 26, 87(1) +; PPC32-NEXT: add 27, 22, 27 +; PPC32-NEXT: lbz 23, 151(1) +; PPC32-NEXT: lbz 11, 111(1) ; PPC32-NEXT: lbz 21, 175(1) -; PPC32-NEXT: add 28, 22, 28 -; PPC32-NEXT: lbz 11, 107(1) -; PPC32-NEXT: lbz 12, 171(1) -; PPC32-NEXT: add 4, 21, 4 +; PPC32-NEXT: add 26, 23, 26 +; PPC32-NEXT: lbz 12, 107(1) +; PPC32-NEXT: lbz 0, 171(1) +; PPC32-NEXT: add 11, 21, 11 ; PPC32-NEXT: stw 30, 56(1) # 4-byte Folded Spill -; PPC32-NEXT: addi 4, 4, 1 -; PPC32-NEXT: lbz 0, 103(1) -; PPC32-NEXT: add 11, 12, 11 -; PPC32-NEXT: lbz 30, 99(1) -; PPC32-NEXT: lbz 23, 163(1) +; PPC32-NEXT: addi 11, 11, 1 +; PPC32-NEXT: lbz 30, 103(1) +; PPC32-NEXT: add 12, 0, 12 ; PPC32-NEXT: lbz 22, 167(1) -; PPC32-NEXT: add 30, 23, 30 -; PPC32-NEXT: stb 4, 15(3) -; PPC32-NEXT: add 23, 22, 0 -; PPC32-NEXT: addi 4, 11, 1 -; PPC32-NEXT: stb 4, 14(3) -; PPC32-NEXT: addi 4, 23, 1 -; PPC32-NEXT: stb 4, 13(3) -; PPC32-NEXT: addi 4, 30, 1 -; PPC32-NEXT: stb 4, 12(3) -; PPC32-NEXT: addi 4, 29, 1 -; PPC32-NEXT: stb 4, 11(3) -; PPC32-NEXT: addi 4, 28, 1 -; PPC32-NEXT: stb 4, 10(3) -; PPC32-NEXT: addi 4, 27, 1 -; PPC32-NEXT: stb 4, 9(3) -; PPC32-NEXT: addi 4, 26, 1 -; PPC32-NEXT: stb 4, 8(3) -; PPC32-NEXT: addi 4, 25, 1 -; PPC32-NEXT: stb 4, 7(3) -; PPC32-NEXT: addi 4, 24, 1 -; PPC32-NEXT: stb 4, 6(3) -; PPC32-NEXT: addi 4, 10, 1 -; PPC32-NEXT: stb 4, 5(3) -; PPC32-NEXT: addi 4, 9, 1 -; PPC32-NEXT: stb 4, 4(3) -; PPC32-NEXT: addi 4, 8, 1 -; PPC32-NEXT: stb 4, 3(3) -; PPC32-NEXT: addi 4, 7, 1 -; PPC32-NEXT: stb 4, 2(3) -; PPC32-NEXT: addi 4, 6, 1 -; PPC32-NEXT: stb 4, 1(3) -; PPC32-NEXT: addi 4, 5, 1 +; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill +; PPC32-NEXT: lbz 29, 99(1) +; PPC32-NEXT: add 30, 22, 30 +; PPC32-NEXT: lbz 23, 163(1) +; PPC32-NEXT: stb 11, 15(3) +; PPC32-NEXT: addi 11, 12, 1 +; PPC32-NEXT: add 29, 23, 29 +; PPC32-NEXT: stb 11, 14(3) +; PPC32-NEXT: addi 11, 30, 1 +; PPC32-NEXT: stb 11, 13(3) +; PPC32-NEXT: addi 11, 29, 1 +; PPC32-NEXT: stb 11, 12(3) +; PPC32-NEXT: addi 11, 28, 1 +; PPC32-NEXT: stb 11, 11(3) +; PPC32-NEXT: addi 11, 27, 1 +; PPC32-NEXT: stb 11, 10(3) +; PPC32-NEXT: addi 11, 26, 1 +; PPC32-NEXT: stb 11, 9(3) +; PPC32-NEXT: addi 11, 25, 1 +; PPC32-NEXT: stb 8, 6(3) +; PPC32-NEXT: addi 8, 10, 1 +; PPC32-NEXT: stb 11, 8(3) +; PPC32-NEXT: addi 11, 24, 1 +; PPC32-NEXT: stb 8, 5(3) +; PPC32-NEXT: addi 8, 9, 1 +; PPC32-NEXT: stb 11, 7(3) +; PPC32-NEXT: stb 8, 4(3) +; PPC32-NEXT: stb 7, 3(3) +; PPC32-NEXT: stb 6, 2(3) +; PPC32-NEXT: stb 5, 1(3) ; PPC32-NEXT: stb 4, 0(3) ; PPC32-NEXT: lwz 30, 56(1) # 4-byte Folded Reload ; PPC32-NEXT: lwz 29, 52(1) # 4-byte Folded Reload @@ -159,6 +160,7 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; PPC32-NEXT: lwz 23, 28(1) # 4-byte Folded Reload ; PPC32-NEXT: lwz 22, 24(1) # 4-byte Folded Reload ; PPC32-NEXT: lwz 21, 20(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 20, 16(1) # 4-byte Folded Reload ; PPC32-NEXT: addi 1, 1, 64 ; PPC32-NEXT: blr ; diff --git a/llvm/test/CodeGen/PowerPC/ldst-16-byte.mir b/llvm/test/CodeGen/PowerPC/ldst-16-byte.mir index 1cc50ab4dcdd5..b9c541feae5ac 100644 --- a/llvm/test/CodeGen/PowerPC/ldst-16-byte.mir +++ b/llvm/test/CodeGen/PowerPC/ldst-16-byte.mir @@ -11,10 +11,11 @@ body: | liveins: $x3, $x4 ; CHECK-LABEL: name: foo ; CHECK: liveins: $x3, $x4 - ; CHECK: early-clobber renamable $g8p3 = LQ 128, $x4 - ; CHECK: $x3 = OR8 $x7, $x7 - ; CHECK: STQ killed renamable $g8p3, 160, $x3 - ; CHECK: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: early-clobber renamable $g8p3 = LQ 128, $x4 + ; CHECK-NEXT: $x3 = OR8 $x7, $x7 + ; CHECK-NEXT: STQ killed renamable $g8p3, 160, $x3 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 %0:g8prc = LQ 128, $x4 $x3 = COPY %0.sub_gp8_x1:g8prc STQ %0, 160, $x3 @@ -30,10 +31,11 @@ body: | liveins: $x3, $x4 ; CHECK-LABEL: name: foobar ; CHECK: liveins: $x3, $x4 - ; CHECK: renamable $g8p3 = LQARX $x3, $x4 - ; CHECK: STQCX renamable $g8p3, $x3, $x4, implicit-def dead $cr0 - ; CHECK: $x3 = OR8 $x7, killed $x7 - ; CHECK: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $g8p3 = LQARX $x3, $x4 + ; CHECK-NEXT: STQCX renamable $g8p3, $x3, $x4, implicit-def dead $cr0 + ; CHECK-NEXT: $x3 = OR8 $x7, killed $x7 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 %0:g8prc = LQARX $x3, $x4 STQCX %0:g8prc, $x3, $x4, implicit-def $cr0 $x3 = COPY %0.sub_gp8_x1:g8prc @@ -49,10 +51,11 @@ body: | liveins: $x3, $x4 ; CHECK-LABEL: name: bar ; CHECK: liveins: $x3, $x4 - ; CHECK: early-clobber renamable $g8p2 = LQ 128, renamable $x3 - ; CHECK: STQ renamable $g8p2, 160, $x3 - ; CHECK: $x3 = OR8 $x4, killed $x4 - ; CHECK: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: early-clobber renamable $g8p2 = LQ 128, renamable $x3 + ; CHECK-NEXT: STQ renamable $g8p2, 160, $x3 + ; CHECK-NEXT: $x3 = OR8 $x4, killed $x4 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 %0:g8rc_nox0 = COPY $x3 %1:g8prc = LQ 128, %0 STQ %1, 160, $x3 @@ -71,97 +74,98 @@ body: | liveins: $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12 ; CHECK-LABEL: name: spill_g8prc ; CHECK: liveins: $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x14, $x15, $x16, $x17, $x18, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $x29, $x30, $x31 - ; CHECK: STD killed $x14, -144, $x1 :: (store (s64) into %fixed-stack.17, align 16) - ; CHECK: STD killed $x15, -136, $x1 :: (store (s64) into %fixed-stack.16) - ; CHECK: STD killed $x16, -128, $x1 :: (store (s64) into %fixed-stack.15, align 16) - ; CHECK: STD killed $x17, -120, $x1 :: (store (s64) into %fixed-stack.14) - ; CHECK: STD killed $x18, -112, $x1 :: (store (s64) into %fixed-stack.13, align 16) - ; CHECK: STD killed $x19, -104, $x1 :: (store (s64) into %fixed-stack.12) - ; CHECK: STD killed $x20, -96, $x1 :: (store (s64) into %fixed-stack.11, align 16) - ; CHECK: STD killed $x21, -88, $x1 :: (store (s64) into %fixed-stack.10) - ; CHECK: STD killed $x22, -80, $x1 :: (store (s64) into %fixed-stack.9, align 16) - ; CHECK: STD killed $x23, -72, $x1 :: (store (s64) into %fixed-stack.8) - ; CHECK: STD killed $x24, -64, $x1 :: (store (s64) into %fixed-stack.7, align 16) - ; CHECK: STD killed $x25, -56, $x1 :: (store (s64) into %fixed-stack.6) - ; CHECK: STD killed $x26, -48, $x1 :: (store (s64) into %fixed-stack.5, align 16) - ; CHECK: STD killed $x27, -40, $x1 :: (store (s64) into %fixed-stack.4) - ; CHECK: STD killed $x28, -32, $x1 :: (store (s64) into %fixed-stack.3, align 16) - ; CHECK: STD killed $x29, -24, $x1 :: (store (s64) into %fixed-stack.2) - ; CHECK: STD killed $x30, -16, $x1 :: (store (s64) into %fixed-stack.1, align 16) - ; CHECK: STD killed $x31, -8, $x1 :: (store (s64) into %fixed-stack.0) - ; CHECK: $x7 = OR8 $x3, $x3 - ; CHECK: renamable $g8p4 = LQARX $x5, $x6 - ; CHECK: STD killed $x8, -160, $x1 - ; CHECK: STD killed $x9, -152, $x1 - ; CHECK: renamable $g8p5 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p4 = LQARX $x3, renamable $x4 - ; CHECK: STD killed $x8, -176, $x1 - ; CHECK: STD killed $x9, -168, $x1 - ; CHECK: renamable $g8p4 = LQARX $x3, renamable $x4 - ; CHECK: STD killed $x8, -192, $x1 - ; CHECK: STD killed $x9, -184, $x1 - ; CHECK: renamable $g8p4 = LQARX $x3, renamable $x4 - ; CHECK: STD killed $x8, -208, $x1 - ; CHECK: STD killed $x9, -200, $x1 - ; CHECK: renamable $g8p4 = LQARX $x3, renamable $x4 - ; CHECK: STD killed $x8, -224, $x1 - ; CHECK: STD killed $x9, -216, $x1 - ; CHECK: renamable $g8p10 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p9 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p8 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p7 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p15 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p11 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p12 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p13 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p14 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p4 = LQARX $x3, renamable $x4 - ; CHECK: $x3 = OR8 $x11, $x11 - ; CHECK: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p14, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p13, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p12, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p11, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p15, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p7, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p8, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p9, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p10, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: $x8 = LD -224, $x1 - ; CHECK: $x9 = LD -216, $x1 - ; CHECK: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: $x8 = LD -208, $x1 - ; CHECK: $x9 = LD -200, $x1 - ; CHECK: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: $x8 = LD -192, $x1 - ; CHECK: $x9 = LD -184, $x1 - ; CHECK: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: $x8 = LD -176, $x1 - ; CHECK: $x9 = LD -168, $x1 - ; CHECK: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p5, killed renamable $x7, killed renamable $x4, implicit-def dead $cr0 - ; CHECK: $x8 = LD -160, $x1 - ; CHECK: $x9 = LD -152, $x1 - ; CHECK: STQCX killed renamable $g8p4, $x5, $x6, implicit-def dead $cr0 - ; CHECK: $x31 = LD -8, $x1 :: (load (s64) from %fixed-stack.0) - ; CHECK: $x30 = LD -16, $x1 :: (load (s64) from %fixed-stack.1, align 16) - ; CHECK: $x29 = LD -24, $x1 :: (load (s64) from %fixed-stack.2) - ; CHECK: $x28 = LD -32, $x1 :: (load (s64) from %fixed-stack.3, align 16) - ; CHECK: $x27 = LD -40, $x1 :: (load (s64) from %fixed-stack.4) - ; CHECK: $x26 = LD -48, $x1 :: (load (s64) from %fixed-stack.5, align 16) - ; CHECK: $x25 = LD -56, $x1 :: (load (s64) from %fixed-stack.6) - ; CHECK: $x24 = LD -64, $x1 :: (load (s64) from %fixed-stack.7, align 16) - ; CHECK: $x23 = LD -72, $x1 :: (load (s64) from %fixed-stack.8) - ; CHECK: $x22 = LD -80, $x1 :: (load (s64) from %fixed-stack.9, align 16) - ; CHECK: $x21 = LD -88, $x1 :: (load (s64) from %fixed-stack.10) - ; CHECK: $x20 = LD -96, $x1 :: (load (s64) from %fixed-stack.11, align 16) - ; CHECK: $x19 = LD -104, $x1 :: (load (s64) from %fixed-stack.12) - ; CHECK: $x18 = LD -112, $x1 :: (load (s64) from %fixed-stack.13, align 16) - ; CHECK: $x17 = LD -120, $x1 :: (load (s64) from %fixed-stack.14) - ; CHECK: $x16 = LD -128, $x1 :: (load (s64) from %fixed-stack.15, align 16) - ; CHECK: $x15 = LD -136, $x1 :: (load (s64) from %fixed-stack.16) - ; CHECK: $x14 = LD -144, $x1 :: (load (s64) from %fixed-stack.17, align 16) - ; CHECK: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: STD killed $x14, -144, $x1 :: (store (s64) into %fixed-stack.17, align 16) + ; CHECK-NEXT: STD killed $x15, -136, $x1 :: (store (s64) into %fixed-stack.16) + ; CHECK-NEXT: STD killed $x16, -128, $x1 :: (store (s64) into %fixed-stack.15, align 16) + ; CHECK-NEXT: STD killed $x17, -120, $x1 :: (store (s64) into %fixed-stack.14) + ; CHECK-NEXT: STD killed $x18, -112, $x1 :: (store (s64) into %fixed-stack.13, align 16) + ; CHECK-NEXT: STD killed $x19, -104, $x1 :: (store (s64) into %fixed-stack.12) + ; CHECK-NEXT: STD killed $x20, -96, $x1 :: (store (s64) into %fixed-stack.11, align 16) + ; CHECK-NEXT: STD killed $x21, -88, $x1 :: (store (s64) into %fixed-stack.10) + ; CHECK-NEXT: STD killed $x22, -80, $x1 :: (store (s64) into %fixed-stack.9, align 16) + ; CHECK-NEXT: STD killed $x23, -72, $x1 :: (store (s64) into %fixed-stack.8) + ; CHECK-NEXT: STD killed $x24, -64, $x1 :: (store (s64) into %fixed-stack.7, align 16) + ; CHECK-NEXT: STD killed $x25, -56, $x1 :: (store (s64) into %fixed-stack.6) + ; CHECK-NEXT: STD killed $x26, -48, $x1 :: (store (s64) into %fixed-stack.5, align 16) + ; CHECK-NEXT: STD killed $x27, -40, $x1 :: (store (s64) into %fixed-stack.4) + ; CHECK-NEXT: STD killed $x28, -32, $x1 :: (store (s64) into %fixed-stack.3, align 16) + ; CHECK-NEXT: STD killed $x29, -24, $x1 :: (store (s64) into %fixed-stack.2) + ; CHECK-NEXT: STD killed $x30, -16, $x1 :: (store (s64) into %fixed-stack.1, align 16) + ; CHECK-NEXT: STD killed $x31, -8, $x1 :: (store (s64) into %fixed-stack.0) + ; CHECK-NEXT: $x7 = OR8 $x3, $x3 + ; CHECK-NEXT: renamable $g8p4 = LQARX $x5, $x6 + ; CHECK-NEXT: STD killed $x8, -160, $x1 + ; CHECK-NEXT: STD killed $x9, -152, $x1 + ; CHECK-NEXT: renamable $g8p13 = LQARX $x3, renamable $x4 + ; CHECK-NEXT: renamable $g8p4 = LQARX $x3, renamable $x4 + ; CHECK-NEXT: STD killed $x8, -176, $x1 + ; CHECK-NEXT: STD killed $x9, -168, $x1 + ; CHECK-NEXT: renamable $g8p4 = LQARX $x3, renamable $x4 + ; CHECK-NEXT: STD killed $x8, -192, $x1 + ; CHECK-NEXT: STD killed $x9, -184, $x1 + ; CHECK-NEXT: renamable $g8p4 = LQARX $x3, renamable $x4 + ; CHECK-NEXT: STD killed $x8, -208, $x1 + ; CHECK-NEXT: STD killed $x9, -200, $x1 + ; CHECK-NEXT: renamable $g8p4 = LQARX $x3, renamable $x4 + ; CHECK-NEXT: STD killed $x8, -224, $x1 + ; CHECK-NEXT: STD killed $x9, -216, $x1 + ; CHECK-NEXT: renamable $g8p10 = LQARX $x3, renamable $x4 + ; CHECK-NEXT: renamable $g8p9 = LQARX $x3, renamable $x4 + ; CHECK-NEXT: renamable $g8p8 = LQARX $x3, renamable $x4 + ; CHECK-NEXT: renamable $g8p7 = LQARX $x3, renamable $x4 + ; CHECK-NEXT: renamable $g8p15 = LQARX $x3, renamable $x4 + ; CHECK-NEXT: renamable $g8p11 = LQARX $x3, renamable $x4 + ; CHECK-NEXT: renamable $g8p12 = LQARX $x3, renamable $x4 + ; CHECK-NEXT: renamable $g8p14 = LQARX $x3, renamable $x4 + ; CHECK-NEXT: renamable $g8p5 = LQARX $x3, renamable $x4 + ; CHECK-NEXT: renamable $g8p4 = LQARX $x3, renamable $x4 + ; CHECK-NEXT: $x3 = OR8 $x27, $x27 + ; CHECK-NEXT: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0 + ; CHECK-NEXT: STQCX killed renamable $g8p5, renamable $x7, renamable $x4, implicit-def dead $cr0 + ; CHECK-NEXT: STQCX killed renamable $g8p14, renamable $x7, renamable $x4, implicit-def dead $cr0 + ; CHECK-NEXT: STQCX killed renamable $g8p12, renamable $x7, renamable $x4, implicit-def dead $cr0 + ; CHECK-NEXT: STQCX killed renamable $g8p11, renamable $x7, renamable $x4, implicit-def dead $cr0 + ; CHECK-NEXT: STQCX killed renamable $g8p15, renamable $x7, renamable $x4, implicit-def dead $cr0 + ; CHECK-NEXT: STQCX killed renamable $g8p7, renamable $x7, renamable $x4, implicit-def dead $cr0 + ; CHECK-NEXT: STQCX killed renamable $g8p8, renamable $x7, renamable $x4, implicit-def dead $cr0 + ; CHECK-NEXT: STQCX killed renamable $g8p9, renamable $x7, renamable $x4, implicit-def dead $cr0 + ; CHECK-NEXT: STQCX killed renamable $g8p10, renamable $x7, renamable $x4, implicit-def dead $cr0 + ; CHECK-NEXT: $x8 = LD -224, $x1 + ; CHECK-NEXT: $x9 = LD -216, $x1 + ; CHECK-NEXT: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0 + ; CHECK-NEXT: $x8 = LD -208, $x1 + ; CHECK-NEXT: $x9 = LD -200, $x1 + ; CHECK-NEXT: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0 + ; CHECK-NEXT: $x8 = LD -192, $x1 + ; CHECK-NEXT: $x9 = LD -184, $x1 + ; CHECK-NEXT: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0 + ; CHECK-NEXT: $x8 = LD -176, $x1 + ; CHECK-NEXT: $x9 = LD -168, $x1 + ; CHECK-NEXT: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0 + ; CHECK-NEXT: STQCX killed renamable $g8p13, killed renamable $x7, killed renamable $x4, implicit-def dead $cr0 + ; CHECK-NEXT: $x8 = LD -160, $x1 + ; CHECK-NEXT: $x9 = LD -152, $x1 + ; CHECK-NEXT: STQCX killed renamable $g8p4, $x5, $x6, implicit-def dead $cr0 + ; CHECK-NEXT: $x31 = LD -8, $x1 :: (load (s64) from %fixed-stack.0) + ; CHECK-NEXT: $x30 = LD -16, $x1 :: (load (s64) from %fixed-stack.1, align 16) + ; CHECK-NEXT: $x29 = LD -24, $x1 :: (load (s64) from %fixed-stack.2) + ; CHECK-NEXT: $x28 = LD -32, $x1 :: (load (s64) from %fixed-stack.3, align 16) + ; CHECK-NEXT: $x27 = LD -40, $x1 :: (load (s64) from %fixed-stack.4) + ; CHECK-NEXT: $x26 = LD -48, $x1 :: (load (s64) from %fixed-stack.5, align 16) + ; CHECK-NEXT: $x25 = LD -56, $x1 :: (load (s64) from %fixed-stack.6) + ; CHECK-NEXT: $x24 = LD -64, $x1 :: (load (s64) from %fixed-stack.7, align 16) + ; CHECK-NEXT: $x23 = LD -72, $x1 :: (load (s64) from %fixed-stack.8) + ; CHECK-NEXT: $x22 = LD -80, $x1 :: (load (s64) from %fixed-stack.9, align 16) + ; CHECK-NEXT: $x21 = LD -88, $x1 :: (load (s64) from %fixed-stack.10) + ; CHECK-NEXT: $x20 = LD -96, $x1 :: (load (s64) from %fixed-stack.11, align 16) + ; CHECK-NEXT: $x19 = LD -104, $x1 :: (load (s64) from %fixed-stack.12) + ; CHECK-NEXT: $x18 = LD -112, $x1 :: (load (s64) from %fixed-stack.13, align 16) + ; CHECK-NEXT: $x17 = LD -120, $x1 :: (load (s64) from %fixed-stack.14) + ; CHECK-NEXT: $x16 = LD -128, $x1 :: (load (s64) from %fixed-stack.15, align 16) + ; CHECK-NEXT: $x15 = LD -136, $x1 :: (load (s64) from %fixed-stack.16) + ; CHECK-NEXT: $x14 = LD -144, $x1 :: (load (s64) from %fixed-stack.17, align 16) + ; CHECK-NEXT: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 %addr0:g8rc_nox0 = COPY $x3 %addr1:g8rc = COPY $x4 %0:g8prc = LQARX $x5, $x6 @@ -209,10 +213,11 @@ body: | liveins: $g8p8 ; CHECK-LABEL: name: copy_g8prc ; CHECK: liveins: $g8p8 - ; CHECK: $x4 = OR8 $x16, $x16 - ; CHECK: $x5 = OR8 $x17, $x17 - ; CHECK: $x3 = OR8 $x5, $x5 - ; CHECK: BLR8 implicit $lr8, implicit undef $rm, implicit killed $x3, implicit $x4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x4 = OR8 $x16, $x16 + ; CHECK-NEXT: $x5 = OR8 $x17, $x17 + ; CHECK-NEXT: $x3 = OR8 $x5, $x5 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit undef $rm, implicit killed $x3, implicit $x4 %0:g8prc = COPY $g8p8 $x3 = COPY %0.sub_gp8_x1:g8prc $x4 = COPY %0.sub_gp8_x0:g8prc diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll index 37baef6043884..900069c6216bf 100644 --- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll @@ -196,13 +196,13 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) { ; CHECK-NEXT: # %bb.1: # %bb3.preheader ; CHECK-NEXT: cmpldi r4, 1 ; CHECK-NEXT: li r5, 1 -; CHECK-NEXT: addi r9, r3, 4002 +; CHECK-NEXT: addi r10, r3, 4002 ; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill ; CHECK-NEXT: li r6, -1 ; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill ; CHECK-NEXT: li r7, 3 ; CHECK-NEXT: li r8, 5 -; CHECK-NEXT: li r10, 9 +; CHECK-NEXT: li r9, 9 ; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill @@ -213,17 +213,17 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB2_2: # %bb3 ; CHECK-NEXT: # -; CHECK-NEXT: ldx r11, r9, r6 -; CHECK-NEXT: ld r12, 0(r9) -; CHECK-NEXT: ldx r0, r9, r5 -; CHECK-NEXT: ldx r30, r9, r7 +; CHECK-NEXT: ldx r11, r10, r6 +; CHECK-NEXT: ld r12, 0(r10) +; CHECK-NEXT: ldx r0, r10, r5 +; CHECK-NEXT: ldx r30, r10, r7 ; CHECK-NEXT: mulld r11, r12, r11 -; CHECK-NEXT: ld r29, 4(r9) -; CHECK-NEXT: ldx r28, r9, r8 -; CHECK-NEXT: ld r27, 12(r9) -; CHECK-NEXT: ld r26, 8(r9) -; CHECK-NEXT: ldx r25, r9, r10 -; CHECK-NEXT: addi r9, r9, 1 +; CHECK-NEXT: ld r29, 4(r10) +; CHECK-NEXT: ldx r28, r10, r8 +; CHECK-NEXT: ld r27, 12(r10) +; CHECK-NEXT: ld r26, 8(r10) +; CHECK-NEXT: ldx r25, r10, r9 +; CHECK-NEXT: addi r10, r10, 1 ; CHECK-NEXT: mulld r11, r11, r0 ; CHECK-NEXT: mulld r11, r11, r30 ; CHECK-NEXT: mulld r11, r11, r29 diff --git a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll index 5d2232319c1f5..9f62477ae01df 100644 --- a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll @@ -60,122 +60,121 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. ; CHECK-NEXT: std 23, 472(1) # 8-byte Folded Spill ; CHECK-NEXT: mr 22, 5 ; CHECK-NEXT: ld 5, 848(1) +; CHECK-NEXT: addi 3, 3, 1 ; CHECK-NEXT: mr 11, 7 ; CHECK-NEXT: ld 23, 688(1) -; CHECK-NEXT: addi 3, 3, 1 -; CHECK-NEXT: ld 2, 760(1) -; CHECK-NEXT: std 28, 512(1) # 8-byte Folded Spill -; CHECK-NEXT: std 29, 520(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 28, 824(1) ; CHECK-NEXT: ld 7, 728(1) ; CHECK-NEXT: std 18, 432(1) # 8-byte Folded Spill ; CHECK-NEXT: std 19, 440(1) # 8-byte Folded Spill ; CHECK-NEXT: mr 18, 6 -; CHECK-NEXT: ld 6, 712(1) -; CHECK-NEXT: cmpldi 3, 9 +; CHECK-NEXT: li 6, 9 ; CHECK-NEXT: ld 19, 768(1) -; CHECK-NEXT: std 10, 64(1) # 8-byte Folded Spill -; CHECK-NEXT: std 6, 72(1) # 8-byte Folded Spill -; CHECK-NEXT: std 5, 200(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 5, 840(1) -; CHECK-NEXT: lxv 33, 0(6) +; CHECK-NEXT: ld 2, 760(1) +; CHECK-NEXT: std 26, 496(1) # 8-byte Folded Spill +; CHECK-NEXT: std 27, 504(1) # 8-byte Folded Spill +; CHECK-NEXT: cmpldi 3, 9 +; CHECK-NEXT: ld 27, 816(1) +; CHECK-NEXT: ld 26, 808(1) ; CHECK-NEXT: std 14, 400(1) # 8-byte Folded Spill ; CHECK-NEXT: std 15, 408(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 15, 736(1) +; CHECK-NEXT: lxv 39, 0(8) ; CHECK-NEXT: std 30, 528(1) # 8-byte Folded Spill ; CHECK-NEXT: std 31, 536(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 30, 704(1) +; CHECK-NEXT: lxv 38, 0(9) ; CHECK-NEXT: std 20, 448(1) # 8-byte Folded Spill ; CHECK-NEXT: std 21, 456(1) # 8-byte Folded Spill ; CHECK-NEXT: ld 21, 784(1) ; CHECK-NEXT: ld 20, 776(1) -; CHECK-NEXT: lxv 10, 0(19) -; CHECK-NEXT: lxv 7, 0(21) -; CHECK-NEXT: ld 15, 736(1) -; CHECK-NEXT: ld 29, 704(1) -; CHECK-NEXT: ld 30, 720(1) -; CHECK-NEXT: std 2, 112(1) # 8-byte Folded Spill +; CHECK-NEXT: std 24, 480(1) # 8-byte Folded Spill +; CHECK-NEXT: std 25, 488(1) # 8-byte Folded Spill +; CHECK-NEXT: iselgt 3, 3, 6 +; CHECK-NEXT: ld 6, 720(1) +; CHECK-NEXT: ld 24, 792(1) +; CHECK-NEXT: std 10, 72(1) # 8-byte Folded Spill +; CHECK-NEXT: std 7, 80(1) # 8-byte Folded Spill +; CHECK-NEXT: addi 3, 3, -2 +; CHECK-NEXT: lxv 6, 0(19) +; CHECK-NEXT: lxv 11, 0(7) +; CHECK-NEXT: std 5, 200(1) # 8-byte Folded Spill +; CHECK-NEXT: std 23, 40(1) # 8-byte Folded Spill +; CHECK-NEXT: std 6, 48(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 5, 840(1) +; CHECK-NEXT: lxv 12, 0(6) +; CHECK-NEXT: rldicl 12, 3, 61, 3 ; CHECK-NEXT: std 19, 120(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 42, 0(9) -; CHECK-NEXT: lxv 37, 0(7) ; CHECK-NEXT: std 20, 128(1) # 8-byte Folded Spill ; CHECK-NEXT: std 21, 136(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 43, 0(8) -; CHECK-NEXT: lxv 41, 0(10) +; CHECK-NEXT: std 24, 144(1) # 8-byte Folded Spill +; CHECK-NEXT: lxv 4, 0(21) +; CHECK-NEXT: ld 25, 800(1) +; CHECK-NEXT: lxv 33, 0(10) +; CHECK-NEXT: lxv 32, 0(23) +; CHECK-NEXT: lxv 36, 0(30) ; CHECK-NEXT: std 16, 416(1) # 8-byte Folded Spill ; CHECK-NEXT: std 17, 424(1) # 8-byte Folded Spill ; CHECK-NEXT: ld 17, 752(1) ; CHECK-NEXT: ld 16, 744(1) -; CHECK-NEXT: std 24, 480(1) # 8-byte Folded Spill -; CHECK-NEXT: std 25, 488(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 25, 800(1) -; CHECK-NEXT: ld 24, 792(1) -; CHECK-NEXT: std 26, 496(1) # 8-byte Folded Spill -; CHECK-NEXT: std 27, 504(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 27, 816(1) -; CHECK-NEXT: ld 26, 808(1) -; CHECK-NEXT: std 8, 48(1) # 8-byte Folded Spill -; CHECK-NEXT: std 9, 56(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 40, 0(23) -; CHECK-NEXT: lxv 38, 0(29) -; CHECK-NEXT: std 7, 80(1) # 8-byte Folded Spill -; CHECK-NEXT: std 15, 88(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 32, 0(30) -; CHECK-NEXT: lxv 36, 0(15) +; CHECK-NEXT: std 28, 512(1) # 8-byte Folded Spill +; CHECK-NEXT: std 29, 520(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 29, 712(1) +; CHECK-NEXT: ld 28, 696(1) +; CHECK-NEXT: std 8, 56(1) # 8-byte Folded Spill +; CHECK-NEXT: std 9, 64(1) # 8-byte Folded Spill +; CHECK-NEXT: lxv 37, 0(28) +; CHECK-NEXT: lxv 13, 0(29) ; CHECK-NEXT: mr 8, 29 -; CHECK-NEXT: mr 10, 30 +; CHECK-NEXT: mr 9, 30 +; CHECK-NEXT: mr 10, 28 +; CHECK-NEXT: std 25, 152(1) # 8-byte Folded Spill ; CHECK-NEXT: std 26, 160(1) # 8-byte Folded Spill -; CHECK-NEXT: std 27, 168(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 13, 0(16) -; CHECK-NEXT: lxv 12, 0(17) +; CHECK-NEXT: lxv 10, 0(15) +; CHECK-NEXT: lxv 9, 0(16) +; CHECK-NEXT: li 28, 1 ; CHECK-NEXT: stfd 26, 544(1) # 8-byte Folded Spill ; CHECK-NEXT: stfd 27, 552(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 11, 0(2) -; CHECK-NEXT: lxv 9, 0(20) +; CHECK-NEXT: lxv 8, 0(17) +; CHECK-NEXT: lxv 7, 0(2) ; CHECK-NEXT: stfd 28, 560(1) # 8-byte Folded Spill ; CHECK-NEXT: stfd 29, 568(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 5, 0(24) -; CHECK-NEXT: lxv 4, 0(25) +; CHECK-NEXT: lxv 5, 0(20) +; CHECK-NEXT: lxv 3, 0(24) ; CHECK-NEXT: stfd 30, 576(1) # 8-byte Folded Spill ; CHECK-NEXT: stfd 31, 584(1) # 8-byte Folded Spill -; CHECK-NEXT: lxv 2, 0(26) -; CHECK-NEXT: lxv 0, 0(27) -; CHECK-NEXT: li 27, 0 +; CHECK-NEXT: lxv 2, 0(25) +; CHECK-NEXT: lxv 1, 0(26) ; CHECK-NEXT: stxv 52, 208(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 53, 224(1) # 16-byte Folded Spill -; CHECK-NEXT: lxv 1, 0(28) +; CHECK-NEXT: lxv 0, 0(27) ; CHECK-NEXT: stxv 54, 240(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 55, 256(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 56, 272(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 57, 288(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 58, 304(1) # 16-byte Folded Spill +; CHECK-NEXT: std 5, 192(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 5, 832(1) ; CHECK-NEXT: stxv 59, 320(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 60, 336(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 61, 352(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 62, 368(1) # 16-byte Folded Spill ; CHECK-NEXT: stxv 63, 384(1) # 16-byte Folded Spill -; CHECK-NEXT: std 5, 192(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 5, 832(1) +; CHECK-NEXT: std 15, 88(1) # 8-byte Folded Spill ; CHECK-NEXT: std 16, 96(1) # 8-byte Folded Spill ; CHECK-NEXT: std 17, 104(1) # 8-byte Folded Spill -; CHECK-NEXT: std 24, 144(1) # 8-byte Folded Spill -; CHECK-NEXT: std 25, 152(1) # 8-byte Folded Spill +; CHECK-NEXT: std 2, 112(1) # 8-byte Folded Spill ; CHECK-NEXT: std 5, 184(1) # 8-byte Folded Spill -; CHECK-NEXT: std 28, 176(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 5, 696(1) -; CHECK-NEXT: li 28, 1 -; CHECK-NEXT: ld 7, 184(1) # 8-byte Folded Reload -; CHECK-NEXT: lxv 3, 0(7) -; CHECK-NEXT: std 5, 32(1) # 8-byte Folded Spill -; CHECK-NEXT: std 23, 40(1) # 8-byte Folded Spill +; CHECK-NEXT: ld 5, 824(1) +; CHECK-NEXT: std 5, 176(1) # 8-byte Folded Spill +; CHECK-NEXT: std 27, 168(1) # 8-byte Folded Spill ; CHECK-NEXT: lwa 5, 0(11) -; CHECK-NEXT: li 11, 9 -; CHECK-NEXT: ld 9, 32(1) # 8-byte Folded Reload -; CHECK-NEXT: iselgt 3, 3, 11 -; CHECK-NEXT: addi 3, 3, -2 +; CHECK-NEXT: li 27, 0 +; CHECK-NEXT: ld 7, 176(1) # 8-byte Folded Reload ; CHECK-NEXT: mulli 6, 5, 40 ; CHECK-NEXT: sldi 0, 5, 4 ; CHECK-NEXT: extswsli 14, 5, 3 -; CHECK-NEXT: rldicl 12, 3, 61, 3 -; CHECK-NEXT: lxv 39, 0(9) +; CHECK-NEXT: lxv 40, 0(7) +; CHECK-NEXT: ld 7, 184(1) # 8-byte Folded Reload ; CHECK-NEXT: add 31, 14, 22 ; CHECK-NEXT: add 11, 0, 22 ; CHECK-NEXT: mr 26, 22 @@ -186,12 +185,13 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. ; CHECK-NEXT: add 19, 22, 6 ; CHECK-NEXT: sldi 6, 5, 5 ; CHECK-NEXT: mulli 5, 5, 24 +; CHECK-NEXT: lxv 41, 0(7) ; CHECK-NEXT: add 20, 22, 6 ; CHECK-NEXT: add 21, 22, 5 ; CHECK-NEXT: ld 5, 192(1) # 8-byte Folded Reload -; CHECK-NEXT: lxv 8, 0(5) +; CHECK-NEXT: lxv 43, 0(5) ; CHECK-NEXT: ld 5, 200(1) # 8-byte Folded Reload -; CHECK-NEXT: lxv 6, 0(5) +; CHECK-NEXT: lxv 42, 0(5) ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_3: # %_loop_2_do_.lr.ph ; CHECK-NEXT: # =>This Loop Header: Depth=1 @@ -212,9 +212,9 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. ; CHECK-NEXT: # => This Inner Loop Header: Depth=2 ; CHECK-NEXT: lxvp 34, 0(6) ; CHECK-NEXT: lxvp 44, 0(5) -; CHECK-NEXT: xvmaddadp 43, 45, 35 +; CHECK-NEXT: xvmaddadp 39, 45, 35 ; CHECK-NEXT: lxvp 46, 0(24) -; CHECK-NEXT: xvmaddadp 42, 47, 35 +; CHECK-NEXT: xvmaddadp 38, 47, 35 ; CHECK-NEXT: lxvp 48, 0(25) ; CHECK-NEXT: lxvp 50, 0(29) ; CHECK-NEXT: lxvp 62, 0(30) @@ -226,28 +226,28 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. ; CHECK-NEXT: lxvp 30, 32(29) ; CHECK-NEXT: lxvp 28, 32(30) ; CHECK-NEXT: lxvp 26, 32(2) -; CHECK-NEXT: xvmaddadp 41, 49, 35 -; CHECK-NEXT: xvmaddadp 40, 51, 35 -; CHECK-NEXT: xvmaddadp 39, 63, 35 -; CHECK-NEXT: xvmaddadp 38, 61, 35 -; CHECK-NEXT: xvmaddadp 33, 44, 34 -; CHECK-NEXT: xvmaddadp 32, 46, 34 -; CHECK-NEXT: xvmaddadp 37, 48, 34 -; CHECK-NEXT: xvmaddadp 36, 50, 34 -; CHECK-NEXT: xvmaddadp 13, 62, 34 -; CHECK-NEXT: xvmaddadp 12, 60, 34 -; CHECK-NEXT: xvmaddadp 11, 57, 59 -; CHECK-NEXT: xvmaddadp 10, 55, 59 -; CHECK-NEXT: xvmaddadp 9, 53, 59 -; CHECK-NEXT: xvmaddadp 7, 31, 59 -; CHECK-NEXT: xvmaddadp 5, 29, 59 -; CHECK-NEXT: xvmaddadp 4, 27, 59 -; CHECK-NEXT: xvmaddadp 2, 56, 58 +; CHECK-NEXT: xvmaddadp 33, 49, 35 +; CHECK-NEXT: xvmaddadp 32, 51, 35 +; CHECK-NEXT: xvmaddadp 37, 63, 35 +; CHECK-NEXT: xvmaddadp 36, 61, 35 +; CHECK-NEXT: xvmaddadp 13, 44, 34 +; CHECK-NEXT: xvmaddadp 12, 46, 34 +; CHECK-NEXT: xvmaddadp 11, 48, 34 +; CHECK-NEXT: xvmaddadp 10, 50, 34 +; CHECK-NEXT: xvmaddadp 9, 62, 34 +; CHECK-NEXT: xvmaddadp 8, 60, 34 +; CHECK-NEXT: xvmaddadp 7, 57, 59 +; CHECK-NEXT: xvmaddadp 6, 55, 59 +; CHECK-NEXT: xvmaddadp 5, 53, 59 +; CHECK-NEXT: xvmaddadp 4, 31, 59 +; CHECK-NEXT: xvmaddadp 3, 29, 59 +; CHECK-NEXT: xvmaddadp 2, 27, 59 +; CHECK-NEXT: xvmaddadp 1, 56, 58 ; CHECK-NEXT: xvmaddadp 0, 54, 58 -; CHECK-NEXT: xvmaddadp 1, 52, 58 -; CHECK-NEXT: xvmaddadp 3, 30, 58 -; CHECK-NEXT: xvmaddadp 8, 28, 58 -; CHECK-NEXT: xvmaddadp 6, 26, 58 +; CHECK-NEXT: xvmaddadp 40, 52, 58 +; CHECK-NEXT: xvmaddadp 41, 30, 58 +; CHECK-NEXT: xvmaddadp 43, 28, 58 +; CHECK-NEXT: xvmaddadp 42, 26, 58 ; CHECK-NEXT: addi 6, 6, 64 ; CHECK-NEXT: addi 5, 5, 64 ; CHECK-NEXT: addi 24, 24, 64 @@ -269,10 +269,10 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. ; CHECK-NEXT: cmpld 28, 4 ; CHECK-NEXT: ble 0, .LBB0_3 ; CHECK-NEXT: # %bb.6: # %_loop_1_loopHeader_._return_bb_crit_edge.loopexit -; CHECK-NEXT: ld 3, 48(1) # 8-byte Folded Reload -; CHECK-NEXT: lxv 63, 384(1) # 16-byte Folded Reload -; CHECK-NEXT: stxv 43, 0(3) ; CHECK-NEXT: ld 3, 56(1) # 8-byte Folded Reload +; CHECK-NEXT: lxv 63, 384(1) # 16-byte Folded Reload +; CHECK-NEXT: stxv 39, 0(3) +; CHECK-NEXT: ld 3, 64(1) # 8-byte Folded Reload ; CHECK-NEXT: lxv 62, 368(1) # 16-byte Folded Reload ; CHECK-NEXT: lxv 61, 352(1) # 16-byte Folded Reload ; CHECK-NEXT: lxv 60, 336(1) # 16-byte Folded Reload @@ -284,8 +284,8 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. ; CHECK-NEXT: lxv 54, 240(1) # 16-byte Folded Reload ; CHECK-NEXT: lxv 53, 224(1) # 16-byte Folded Reload ; CHECK-NEXT: lxv 52, 208(1) # 16-byte Folded Reload -; CHECK-NEXT: stxv 42, 0(3) -; CHECK-NEXT: ld 3, 64(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 38, 0(3) +; CHECK-NEXT: ld 3, 72(1) # 8-byte Folded Reload ; CHECK-NEXT: lfd 31, 584(1) # 8-byte Folded Reload ; CHECK-NEXT: lfd 30, 576(1) # 8-byte Folded Reload ; CHECK-NEXT: lfd 29, 568(1) # 8-byte Folded Reload @@ -297,7 +297,7 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. ; CHECK-NEXT: ld 29, 520(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 28, 512(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 27, 504(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 41, 0(3) +; CHECK-NEXT: stxv 33, 0(3) ; CHECK-NEXT: ld 3, 40(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 26, 496(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 25, 488(1) # 8-byte Folded Reload @@ -310,46 +310,46 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %. ; CHECK-NEXT: ld 18, 432(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 17, 424(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 16, 416(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 40, 0(3) -; CHECK-NEXT: ld 3, 72(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 39, 0(9) -; CHECK-NEXT: stxv 38, 0(8) +; CHECK-NEXT: stxv 32, 0(3) +; CHECK-NEXT: ld 3, 48(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 37, 0(10) +; CHECK-NEXT: stxv 36, 0(9) +; CHECK-NEXT: stxv 13, 0(8) ; CHECK-NEXT: ld 15, 408(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 14, 400(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 33, 0(3) +; CHECK-NEXT: stxv 12, 0(3) ; CHECK-NEXT: ld 3, 80(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 32, 0(10) -; CHECK-NEXT: stxv 37, 0(3) +; CHECK-NEXT: stxv 11, 0(3) ; CHECK-NEXT: ld 3, 88(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 36, 0(3) +; CHECK-NEXT: stxv 10, 0(3) ; CHECK-NEXT: ld 3, 96(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 13, 0(3) +; CHECK-NEXT: stxv 9, 0(3) ; CHECK-NEXT: ld 3, 104(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 12, 0(3) +; CHECK-NEXT: stxv 8, 0(3) ; CHECK-NEXT: ld 3, 112(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 11, 0(3) +; CHECK-NEXT: stxv 7, 0(3) ; CHECK-NEXT: ld 3, 120(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 10, 0(3) +; CHECK-NEXT: stxv 6, 0(3) ; CHECK-NEXT: ld 3, 128(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 9, 0(3) +; CHECK-NEXT: stxv 5, 0(3) ; CHECK-NEXT: ld 3, 136(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 7, 0(3) +; CHECK-NEXT: stxv 4, 0(3) ; CHECK-NEXT: ld 3, 144(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 5, 0(3) +; CHECK-NEXT: stxv 3, 0(3) ; CHECK-NEXT: ld 3, 152(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 4, 0(3) -; CHECK-NEXT: ld 3, 160(1) # 8-byte Folded Reload ; CHECK-NEXT: stxv 2, 0(3) +; CHECK-NEXT: ld 3, 160(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 1, 0(3) ; CHECK-NEXT: ld 3, 168(1) # 8-byte Folded Reload ; CHECK-NEXT: stxv 0, 0(3) ; CHECK-NEXT: ld 3, 176(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 1, 0(3) +; CHECK-NEXT: stxv 40, 0(3) ; CHECK-NEXT: ld 3, 184(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 3, 0(3) +; CHECK-NEXT: stxv 41, 0(3) ; CHECK-NEXT: ld 3, 192(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 8, 0(3) +; CHECK-NEXT: stxv 43, 0(3) ; CHECK-NEXT: ld 3, 200(1) # 8-byte Folded Reload -; CHECK-NEXT: stxv 6, 0(3) +; CHECK-NEXT: stxv 42, 0(3) ; CHECK-NEXT: addi 1, 1, 592 ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll index 3d2b6bc4da2a9..7a6640fea2d1e 100644 --- a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll +++ b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll @@ -280,37 +280,37 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-LE-NEXT: std r25, -56(r1) # 8-byte Folded Spill ; CHECK-PWR9-LE-NEXT: clrlwi r6, r6, 24 ; CHECK-PWR9-LE-NEXT: clrlwi r3, r3, 24 +; CHECK-PWR9-LE-NEXT: clrlwi r8, r8, 24 +; CHECK-PWR9-LE-NEXT: clrlwi r5, r5, 24 ; CHECK-PWR9-LE-NEXT: vextubrx r7, r4, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r4, r4, v3 -; CHECK-PWR9-LE-NEXT: clrlwi r8, r8, 24 ; CHECK-PWR9-LE-NEXT: sub r3, r6, r3 -; CHECK-PWR9-LE-NEXT: clrlwi r5, r5, 24 +; CHECK-PWR9-LE-NEXT: sub r6, r8, r5 ; CHECK-PWR9-LE-NEXT: clrlwi r7, r7, 24 ; CHECK-PWR9-LE-NEXT: clrlwi r4, r4, 24 -; CHECK-PWR9-LE-NEXT: sub r5, r8, r5 ; CHECK-PWR9-LE-NEXT: sub r4, r7, r4 -; CHECK-PWR9-LE-NEXT: srawi r6, r3, 31 +; CHECK-PWR9-LE-NEXT: srawi r5, r3, 31 ; CHECK-PWR9-LE-NEXT: srawi r7, r4, 31 -; CHECK-PWR9-LE-NEXT: xor r3, r3, r6 +; CHECK-PWR9-LE-NEXT: xor r3, r3, r5 ; CHECK-PWR9-LE-NEXT: xor r4, r4, r7 -; CHECK-PWR9-LE-NEXT: sub r6, r3, r6 -; CHECK-PWR9-LE-NEXT: srawi r3, r5, 31 +; CHECK-PWR9-LE-NEXT: sub r5, r3, r5 +; CHECK-PWR9-LE-NEXT: srawi r3, r6, 31 ; CHECK-PWR9-LE-NEXT: sub r4, r4, r7 -; CHECK-PWR9-LE-NEXT: xor r5, r5, r3 -; CHECK-PWR9-LE-NEXT: sub r3, r5, r3 -; CHECK-PWR9-LE-NEXT: li r5, 3 -; CHECK-PWR9-LE-NEXT: vextubrx r7, r5, v2 -; CHECK-PWR9-LE-NEXT: vextubrx r5, r5, v3 +; CHECK-PWR9-LE-NEXT: xor r6, r6, r3 +; CHECK-PWR9-LE-NEXT: sub r3, r6, r3 +; CHECK-PWR9-LE-NEXT: li r6, 3 +; CHECK-PWR9-LE-NEXT: vextubrx r7, r6, v2 +; CHECK-PWR9-LE-NEXT: vextubrx r6, r6, v3 ; CHECK-PWR9-LE-NEXT: clrlwi r7, r7, 24 -; CHECK-PWR9-LE-NEXT: clrlwi r5, r5, 24 -; CHECK-PWR9-LE-NEXT: sub r5, r7, r5 -; CHECK-PWR9-LE-NEXT: srawi r7, r5, 31 -; CHECK-PWR9-LE-NEXT: xor r5, r5, r7 -; CHECK-PWR9-LE-NEXT: sub r5, r5, r7 +; CHECK-PWR9-LE-NEXT: clrlwi r6, r6, 24 +; CHECK-PWR9-LE-NEXT: sub r6, r7, r6 +; CHECK-PWR9-LE-NEXT: srawi r7, r6, 31 +; CHECK-PWR9-LE-NEXT: xor r6, r6, r7 +; CHECK-PWR9-LE-NEXT: sub r6, r6, r7 ; CHECK-PWR9-LE-NEXT: li r7, 4 ; CHECK-PWR9-LE-NEXT: vextubrx r8, r7, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r7, r7, v3 -; CHECK-PWR9-LE-NEXT: mtvsrd v4, r5 +; CHECK-PWR9-LE-NEXT: mtvsrd v4, r6 ; CHECK-PWR9-LE-NEXT: clrlwi r8, r8, 24 ; CHECK-PWR9-LE-NEXT: clrlwi r7, r7, 24 ; CHECK-PWR9-LE-NEXT: sub r7, r8, r7 @@ -411,7 +411,7 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-LE-NEXT: li r26, 15 ; CHECK-PWR9-LE-NEXT: vextubrx r25, r26, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r26, r26, v3 -; CHECK-PWR9-LE-NEXT: mtvsrd v2, r6 +; CHECK-PWR9-LE-NEXT: mtvsrd v2, r5 ; CHECK-PWR9-LE-NEXT: mtvsrd v3, r4 ; CHECK-PWR9-LE-NEXT: vmrghb v2, v3, v2 ; CHECK-PWR9-LE-NEXT: mtvsrd v3, r3 @@ -652,87 +652,95 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR8: # %bb.0: # %entry ; CHECK-PWR8-NEXT: xxswapd vs0, v2 ; CHECK-PWR8-NEXT: xxswapd vs1, v3 +; CHECK-PWR8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-PWR8-NEXT: std r28, -32(r1) # 8-byte Folded Spill ; CHECK-PWR8-NEXT: std r29, -24(r1) # 8-byte Folded Spill +; CHECK-PWR8-NEXT: std r26, -48(r1) # 8-byte Folded Spill +; CHECK-PWR8-NEXT: mffprd r11, f0 +; CHECK-PWR8-NEXT: mffprd r8, f1 ; CHECK-PWR8-NEXT: std r27, -40(r1) # 8-byte Folded Spill -; CHECK-PWR8-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; CHECK-PWR8-NEXT: mffprd r5, f0 -; CHECK-PWR8-NEXT: mffprd r11, f1 ; CHECK-PWR8-NEXT: std r25, -56(r1) # 8-byte Folded Spill -; CHECK-PWR8-NEXT: std r26, -48(r1) # 8-byte Folded Spill -; CHECK-PWR8-NEXT: clrldi r3, r5, 56 -; CHECK-PWR8-NEXT: clrldi r4, r11, 56 -; CHECK-PWR8-NEXT: rldicl r6, r5, 56, 56 -; CHECK-PWR8-NEXT: rldicl r7, r11, 56, 56 -; CHECK-PWR8-NEXT: rldicl r10, r5, 40, 56 -; CHECK-PWR8-NEXT: rldicl r12, r11, 40, 56 -; CHECK-PWR8-NEXT: rldicl r8, r5, 48, 56 -; CHECK-PWR8-NEXT: rldicl r9, r11, 48, 56 -; CHECK-PWR8-NEXT: rldicl r29, r5, 24, 56 -; CHECK-PWR8-NEXT: rldicl r28, r11, 24, 56 -; CHECK-PWR8-NEXT: rldicl r27, r5, 16, 56 -; CHECK-PWR8-NEXT: rldicl r0, r5, 32, 56 -; CHECK-PWR8-NEXT: rldicl r30, r11, 32, 56 -; CHECK-PWR8-NEXT: rldicl r5, r5, 8, 56 +; CHECK-PWR8-NEXT: clrldi r3, r11, 56 +; CHECK-PWR8-NEXT: clrldi r4, r8, 56 +; CHECK-PWR8-NEXT: rldicl r5, r11, 56, 56 +; CHECK-PWR8-NEXT: rldicl r6, r8, 56, 56 +; CHECK-PWR8-NEXT: rldicl r7, r11, 48, 56 +; CHECK-PWR8-NEXT: rldicl r9, r8, 48, 56 +; CHECK-PWR8-NEXT: rldicl r0, r11, 32, 56 +; CHECK-PWR8-NEXT: rldicl r30, r8, 32, 56 +; CHECK-PWR8-NEXT: rldicl r29, r11, 24, 56 +; CHECK-PWR8-NEXT: rldicl r28, r8, 24, 56 +; CHECK-PWR8-NEXT: rldicl r10, r11, 40, 56 +; CHECK-PWR8-NEXT: rldicl r12, r8, 40, 56 +; CHECK-PWR8-NEXT: rldicl r27, r11, 16, 56 +; CHECK-PWR8-NEXT: rldicl r11, r11, 8, 56 ; CHECK-PWR8-NEXT: std r24, -64(r1) # 8-byte Folded Spill ; CHECK-PWR8-NEXT: clrlwi r3, r3, 24 ; CHECK-PWR8-NEXT: clrlwi r4, r4, 24 +; CHECK-PWR8-NEXT: clrlwi r5, r5, 24 ; CHECK-PWR8-NEXT: clrlwi r6, r6, 24 ; CHECK-PWR8-NEXT: clrlwi r7, r7, 24 -; CHECK-PWR8-NEXT: clrlwi r10, r10, 24 -; CHECK-PWR8-NEXT: clrlwi r12, r12, 24 -; CHECK-PWR8-NEXT: sub r3, r3, r4 -; CHECK-PWR8-NEXT: sub r4, r6, r7 -; CHECK-PWR8-NEXT: sub r7, r10, r12 -; CHECK-PWR8-NEXT: clrlwi r8, r8, 24 ; CHECK-PWR8-NEXT: clrlwi r9, r9, 24 +; CHECK-PWR8-NEXT: sub r3, r3, r4 +; CHECK-PWR8-NEXT: clrlwi r0, r0, 24 +; CHECK-PWR8-NEXT: clrlwi r30, r30, 24 +; CHECK-PWR8-NEXT: sub r4, r5, r6 +; CHECK-PWR8-NEXT: sub r5, r7, r9 ; CHECK-PWR8-NEXT: clrlwi r29, r29, 24 ; CHECK-PWR8-NEXT: clrlwi r28, r28, 24 -; CHECK-PWR8-NEXT: sub r6, r8, r9 +; CHECK-PWR8-NEXT: sub r7, r0, r30 ; CHECK-PWR8-NEXT: sub r9, r29, r28 +; CHECK-PWR8-NEXT: clrlwi r10, r10, 24 +; CHECK-PWR8-NEXT: clrlwi r12, r12, 24 +; CHECK-PWR8-NEXT: sub r6, r10, r12 ; CHECK-PWR8-NEXT: clrlwi r27, r27, 24 -; CHECK-PWR8-NEXT: clrlwi r0, r0, 24 -; CHECK-PWR8-NEXT: clrlwi r30, r30, 24 -; CHECK-PWR8-NEXT: sub r8, r0, r30 -; CHECK-PWR8-NEXT: clrlwi r5, r5, 24 -; CHECK-PWR8-NEXT: srawi r10, r3, 31 +; CHECK-PWR8-NEXT: clrlwi r11, r11, 24 +; CHECK-PWR8-NEXT: srawi r0, r5, 31 +; CHECK-PWR8-NEXT: srawi r29, r7, 31 ; CHECK-PWR8-NEXT: srawi r12, r4, 31 ; CHECK-PWR8-NEXT: srawi r28, r9, 31 -; CHECK-PWR8-NEXT: srawi r0, r6, 31 -; CHECK-PWR8-NEXT: srawi r29, r8, 31 -; CHECK-PWR8-NEXT: srawi r30, r7, 31 -; CHECK-PWR8-NEXT: xor r3, r3, r10 -; CHECK-PWR8-NEXT: sub r10, r3, r10 -; CHECK-PWR8-NEXT: rldicl r3, r11, 16, 56 +; CHECK-PWR8-NEXT: srawi r30, r6, 31 +; CHECK-PWR8-NEXT: srawi r10, r3, 31 +; CHECK-PWR8-NEXT: xor r5, r5, r0 +; CHECK-PWR8-NEXT: xor r26, r7, r29 +; CHECK-PWR8-NEXT: sub r7, r5, r0 +; CHECK-PWR8-NEXT: rldicl r5, r8, 16, 56 +; CHECK-PWR8-NEXT: rldicl r8, r8, 8, 56 ; CHECK-PWR8-NEXT: xor r4, r4, r12 -; CHECK-PWR8-NEXT: rldicl r11, r11, 8, 56 ; CHECK-PWR8-NEXT: xor r25, r9, r28 ; CHECK-PWR8-NEXT: sub r9, r4, r12 -; CHECK-PWR8-NEXT: sub r4, r25, r28 +; CHECK-PWR8-NEXT: sub r4, r26, r29 ; CHECK-PWR8-NEXT: mtvsrd v1, r9 -; CHECK-PWR8-NEXT: clrlwi r3, r3, 24 -; CHECK-PWR8-NEXT: mtvsrd v7, r4 -; CHECK-PWR8-NEXT: sub r3, r27, r3 -; CHECK-PWR8-NEXT: clrlwi r11, r11, 24 -; CHECK-PWR8-NEXT: xor r6, r6, r0 -; CHECK-PWR8-NEXT: sub r5, r5, r11 -; CHECK-PWR8-NEXT: xor r26, r8, r29 -; CHECK-PWR8-NEXT: sub r8, r6, r0 -; CHECK-PWR8-NEXT: mfvsrd r0, v3 -; CHECK-PWR8-NEXT: xor r7, r7, r30 -; CHECK-PWR8-NEXT: sub r7, r7, r30 -; CHECK-PWR8-NEXT: sub r6, r26, r29 -; CHECK-PWR8-NEXT: mtvsrd v6, r7 -; CHECK-PWR8-NEXT: clrldi r30, r0, 56 -; CHECK-PWR8-NEXT: rldicl r29, r0, 56, 56 -; CHECK-PWR8-NEXT: rldicl r28, r0, 48, 56 -; CHECK-PWR8-NEXT: rldicl r27, r0, 40, 56 -; CHECK-PWR8-NEXT: rldicl r26, r0, 32, 56 -; CHECK-PWR8-NEXT: rldicl r25, r0, 24, 56 -; CHECK-PWR8-NEXT: rldicl r24, r0, 16, 56 -; CHECK-PWR8-NEXT: rldicl r0, r0, 8, 56 -; CHECK-PWR8-NEXT: srawi r12, r3, 31 -; CHECK-PWR8-NEXT: srawi r11, r5, 31 +; CHECK-PWR8-NEXT: clrlwi r5, r5, 24 +; CHECK-PWR8-NEXT: sub r5, r27, r5 +; CHECK-PWR8-NEXT: clrlwi r8, r8, 24 +; CHECK-PWR8-NEXT: sub r8, r11, r8 +; CHECK-PWR8-NEXT: xor r6, r6, r30 +; CHECK-PWR8-NEXT: sub r6, r6, r30 +; CHECK-PWR8-NEXT: xor r3, r3, r10 +; CHECK-PWR8-NEXT: sub r10, r3, r10 +; CHECK-PWR8-NEXT: sub r3, r25, r28 +; CHECK-PWR8-NEXT: mtvsrd v6, r6 +; CHECK-PWR8-NEXT: mtvsrd v7, r3 +; CHECK-PWR8-NEXT: srawi r12, r5, 31 +; CHECK-PWR8-NEXT: srawi r11, r8, 31 +; CHECK-PWR8-NEXT: xor r5, r5, r12 +; CHECK-PWR8-NEXT: xor r8, r8, r11 +; CHECK-PWR8-NEXT: sub r5, r5, r12 +; CHECK-PWR8-NEXT: sub r8, r8, r11 +; CHECK-PWR8-NEXT: mfvsrd r11, v2 +; CHECK-PWR8-NEXT: mfvsrd r12, v3 +; CHECK-PWR8-NEXT: mtvsrd v8, r8 +; CHECK-PWR8-NEXT: clrldi r0, r11, 56 +; CHECK-PWR8-NEXT: clrldi r30, r12, 56 +; CHECK-PWR8-NEXT: rldicl r29, r12, 56, 56 +; CHECK-PWR8-NEXT: rldicl r28, r12, 48, 56 +; CHECK-PWR8-NEXT: rldicl r27, r12, 40, 56 +; CHECK-PWR8-NEXT: rldicl r26, r12, 32, 56 +; CHECK-PWR8-NEXT: rldicl r25, r12, 24, 56 +; CHECK-PWR8-NEXT: rldicl r24, r12, 16, 56 +; CHECK-PWR8-NEXT: rldicl r12, r12, 8, 56 +; CHECK-PWR8-NEXT: clrlwi r0, r0, 24 ; CHECK-PWR8-NEXT: clrlwi r30, r30, 24 ; CHECK-PWR8-NEXT: clrlwi r29, r29, 24 ; CHECK-PWR8-NEXT: clrlwi r28, r28, 24 @@ -740,27 +748,19 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR8-NEXT: clrlwi r26, r26, 24 ; CHECK-PWR8-NEXT: clrlwi r25, r25, 24 ; CHECK-PWR8-NEXT: clrlwi r24, r24, 24 -; CHECK-PWR8-NEXT: clrlwi r0, r0, 24 -; CHECK-PWR8-NEXT: xor r3, r3, r12 -; CHECK-PWR8-NEXT: sub r3, r3, r12 -; CHECK-PWR8-NEXT: mfvsrd r12, v2 -; CHECK-PWR8-NEXT: xor r5, r5, r11 -; CHECK-PWR8-NEXT: sub r5, r5, r11 -; CHECK-PWR8-NEXT: mtvsrd v8, r5 -; CHECK-PWR8-NEXT: clrldi r11, r12, 56 -; CHECK-PWR8-NEXT: clrlwi r11, r11, 24 -; CHECK-PWR8-NEXT: sub r11, r11, r30 -; CHECK-PWR8-NEXT: srawi r30, r11, 31 -; CHECK-PWR8-NEXT: xor r11, r11, r30 -; CHECK-PWR8-NEXT: sub r11, r11, r30 -; CHECK-PWR8-NEXT: rldicl r30, r12, 56, 56 +; CHECK-PWR8-NEXT: clrlwi r12, r12, 24 +; CHECK-PWR8-NEXT: sub r0, r0, r30 +; CHECK-PWR8-NEXT: srawi r30, r0, 31 +; CHECK-PWR8-NEXT: xor r0, r0, r30 +; CHECK-PWR8-NEXT: sub r0, r0, r30 +; CHECK-PWR8-NEXT: rldicl r30, r11, 56, 56 ; CHECK-PWR8-NEXT: clrlwi r30, r30, 24 -; CHECK-PWR8-NEXT: mtvsrd v2, r11 +; CHECK-PWR8-NEXT: mtvsrd v2, r0 ; CHECK-PWR8-NEXT: sub r30, r30, r29 ; CHECK-PWR8-NEXT: srawi r29, r30, 31 ; CHECK-PWR8-NEXT: xor r30, r30, r29 ; CHECK-PWR8-NEXT: sub r30, r30, r29 -; CHECK-PWR8-NEXT: rldicl r29, r12, 48, 56 +; CHECK-PWR8-NEXT: rldicl r29, r11, 48, 56 ; CHECK-PWR8-NEXT: clrlwi r29, r29, 24 ; CHECK-PWR8-NEXT: mtvsrd v3, r30 ; CHECK-PWR8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -768,13 +768,13 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR8-NEXT: srawi r28, r29, 31 ; CHECK-PWR8-NEXT: xor r29, r29, r28 ; CHECK-PWR8-NEXT: sub r29, r29, r28 -; CHECK-PWR8-NEXT: rldicl r28, r12, 40, 56 +; CHECK-PWR8-NEXT: rldicl r28, r11, 40, 56 ; CHECK-PWR8-NEXT: clrlwi r28, r28, 24 ; CHECK-PWR8-NEXT: sub r28, r28, r27 ; CHECK-PWR8-NEXT: srawi r27, r28, 31 ; CHECK-PWR8-NEXT: xor r28, r28, r27 ; CHECK-PWR8-NEXT: sub r28, r28, r27 -; CHECK-PWR8-NEXT: rldicl r27, r12, 32, 56 +; CHECK-PWR8-NEXT: rldicl r27, r11, 32, 56 ; CHECK-PWR8-NEXT: clrlwi r27, r27, 24 ; CHECK-PWR8-NEXT: mtvsrd v4, r28 ; CHECK-PWR8-NEXT: ld r28, -32(r1) # 8-byte Folded Reload @@ -782,28 +782,28 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR8-NEXT: srawi r26, r27, 31 ; CHECK-PWR8-NEXT: xor r27, r27, r26 ; CHECK-PWR8-NEXT: sub r27, r27, r26 -; CHECK-PWR8-NEXT: rldicl r26, r12, 24, 56 +; CHECK-PWR8-NEXT: rldicl r26, r11, 24, 56 ; CHECK-PWR8-NEXT: clrlwi r26, r26, 24 ; CHECK-PWR8-NEXT: sub r26, r26, r25 ; CHECK-PWR8-NEXT: srawi r25, r26, 31 ; CHECK-PWR8-NEXT: xor r26, r26, r25 ; CHECK-PWR8-NEXT: sub r26, r26, r25 -; CHECK-PWR8-NEXT: rldicl r25, r12, 16, 56 -; CHECK-PWR8-NEXT: rldicl r12, r12, 8, 56 +; CHECK-PWR8-NEXT: rldicl r25, r11, 16, 56 +; CHECK-PWR8-NEXT: rldicl r11, r11, 8, 56 ; CHECK-PWR8-NEXT: clrlwi r25, r25, 24 -; CHECK-PWR8-NEXT: clrlwi r12, r12, 24 +; CHECK-PWR8-NEXT: clrlwi r11, r11, 24 ; CHECK-PWR8-NEXT: mtvsrd v5, r26 ; CHECK-PWR8-NEXT: ld r26, -48(r1) # 8-byte Folded Reload ; CHECK-PWR8-NEXT: sub r25, r25, r24 -; CHECK-PWR8-NEXT: sub r12, r12, r0 +; CHECK-PWR8-NEXT: sub r11, r11, r12 ; CHECK-PWR8-NEXT: srawi r24, r25, 31 -; CHECK-PWR8-NEXT: srawi r0, r12, 31 +; CHECK-PWR8-NEXT: srawi r12, r11, 31 ; CHECK-PWR8-NEXT: xor r25, r25, r24 -; CHECK-PWR8-NEXT: xor r12, r12, r0 +; CHECK-PWR8-NEXT: xor r11, r11, r12 ; CHECK-PWR8-NEXT: sub r25, r25, r24 -; CHECK-PWR8-NEXT: sub r12, r12, r0 +; CHECK-PWR8-NEXT: sub r11, r11, r12 ; CHECK-PWR8-NEXT: ld r24, -64(r1) # 8-byte Folded Reload -; CHECK-PWR8-NEXT: mtvsrd v0, r12 +; CHECK-PWR8-NEXT: mtvsrd v0, r11 ; CHECK-PWR8-NEXT: vmrghb v2, v3, v2 ; CHECK-PWR8-NEXT: mtvsrd v3, r29 ; CHECK-PWR8-NEXT: ld r29, -24(r1) # 8-byte Folded Reload @@ -819,12 +819,12 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR8-NEXT: vmrglh v3, v5, v4 ; CHECK-PWR8-NEXT: xxmrglw vs0, v3, v2 ; CHECK-PWR8-NEXT: vmrghb v0, v1, v0 -; CHECK-PWR8-NEXT: mtvsrd v1, r8 +; CHECK-PWR8-NEXT: mtvsrd v1, r7 ; CHECK-PWR8-NEXT: vmrghb v1, v6, v1 -; CHECK-PWR8-NEXT: mtvsrd v6, r6 +; CHECK-PWR8-NEXT: mtvsrd v6, r4 ; CHECK-PWR8-NEXT: vmrglh v4, v1, v0 ; CHECK-PWR8-NEXT: vmrghb v6, v7, v6 -; CHECK-PWR8-NEXT: mtvsrd v7, r3 +; CHECK-PWR8-NEXT: mtvsrd v7, r5 ; CHECK-PWR8-NEXT: vmrghb v7, v8, v7 ; CHECK-PWR8-NEXT: vmrglh v5, v7, v6 ; CHECK-PWR8-NEXT: xxmrglw vs1, v5, v4 @@ -854,85 +854,85 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR7-NEXT: addi r3, r1, 320 ; CHECK-PWR7-NEXT: lbz r4, 304(r1) ; CHECK-PWR7-NEXT: stxvw4x v3, 0, r3 -; CHECK-PWR7-NEXT: lbz r3, 320(r1) ; CHECK-PWR7-NEXT: lbz r5, 305(r1) ; CHECK-PWR7-NEXT: lbz r6, 321(r1) ; CHECK-PWR7-NEXT: lbz r7, 306(r1) ; CHECK-PWR7-NEXT: lbz r8, 322(r1) -; CHECK-PWR7-NEXT: lbz r0, 309(r1) -; CHECK-PWR7-NEXT: lbz r30, 325(r1) ; CHECK-PWR7-NEXT: lbz r9, 307(r1) ; CHECK-PWR7-NEXT: lbz r10, 323(r1) +; CHECK-PWR7-NEXT: lbz r0, 309(r1) +; CHECK-PWR7-NEXT: lbz r30, 325(r1) ; CHECK-PWR7-NEXT: lbz r29, 310(r1) ; CHECK-PWR7-NEXT: lbz r28, 326(r1) ; CHECK-PWR7-NEXT: lbz r11, 308(r1) ; CHECK-PWR7-NEXT: lbz r12, 324(r1) ; CHECK-PWR7-NEXT: lbz r27, 311(r1) ; CHECK-PWR7-NEXT: lbz r26, 327(r1) -; CHECK-PWR7-NEXT: sub r3, r4, r3 -; CHECK-PWR7-NEXT: sub r4, r5, r6 -; CHECK-PWR7-NEXT: sub r5, r7, r8 -; CHECK-PWR7-NEXT: sub r8, r0, r30 -; CHECK-PWR7-NEXT: sub r6, r9, r10 -; CHECK-PWR7-NEXT: sub r9, r29, r28 -; CHECK-PWR7-NEXT: srawi r0, r4, 31 -; CHECK-PWR7-NEXT: srawi r30, r5, 31 -; CHECK-PWR7-NEXT: srawi r29, r6, 31 -; CHECK-PWR7-NEXT: sub r7, r11, r12 -; CHECK-PWR7-NEXT: srawi r28, r7, 31 -; CHECK-PWR7-NEXT: sub r10, r27, r26 -; CHECK-PWR7-NEXT: srawi r27, r8, 31 -; CHECK-PWR7-NEXT: xor r4, r4, r0 -; CHECK-PWR7-NEXT: xor r5, r5, r30 -; CHECK-PWR7-NEXT: xor r6, r6, r29 -; CHECK-PWR7-NEXT: xor r7, r7, r28 -; CHECK-PWR7-NEXT: xor r8, r8, r27 -; CHECK-PWR7-NEXT: srawi r26, r9, 31 -; CHECK-PWR7-NEXT: sub r4, r4, r0 -; CHECK-PWR7-NEXT: sub r5, r5, r30 +; CHECK-PWR7-NEXT: lbz r25, 312(r1) +; CHECK-PWR7-NEXT: sub r5, r5, r6 +; CHECK-PWR7-NEXT: sub r6, r7, r8 +; CHECK-PWR7-NEXT: sub r7, r9, r10 +; CHECK-PWR7-NEXT: sub r9, r0, r30 +; CHECK-PWR7-NEXT: sub r10, r29, r28 +; CHECK-PWR7-NEXT: sub r8, r11, r12 +; CHECK-PWR7-NEXT: srawi r0, r5, 31 +; CHECK-PWR7-NEXT: srawi r30, r6, 31 +; CHECK-PWR7-NEXT: srawi r29, r7, 31 +; CHECK-PWR7-NEXT: srawi r28, r8, 31 +; CHECK-PWR7-NEXT: sub r11, r27, r26 +; CHECK-PWR7-NEXT: srawi r27, r9, 31 +; CHECK-PWR7-NEXT: lbz r24, 328(r1) +; CHECK-PWR7-NEXT: xor r5, r5, r0 +; CHECK-PWR7-NEXT: xor r6, r6, r30 +; CHECK-PWR7-NEXT: xor r7, r7, r29 +; CHECK-PWR7-NEXT: xor r8, r8, r28 +; CHECK-PWR7-NEXT: xor r9, r9, r27 +; CHECK-PWR7-NEXT: srawi r26, r10, 31 +; CHECK-PWR7-NEXT: sub r5, r5, r0 +; CHECK-PWR7-NEXT: sub r6, r6, r30 ; CHECK-PWR7-NEXT: lbz r0, 313(r1) ; CHECK-PWR7-NEXT: lbz r30, 329(r1) -; CHECK-PWR7-NEXT: sub r6, r6, r29 +; CHECK-PWR7-NEXT: sub r7, r7, r29 ; CHECK-PWR7-NEXT: lbz r29, 330(r1) -; CHECK-PWR7-NEXT: sub r7, r7, r28 +; CHECK-PWR7-NEXT: sub r8, r8, r28 ; CHECK-PWR7-NEXT: lbz r28, 331(r1) -; CHECK-PWR7-NEXT: sub r8, r8, r27 +; CHECK-PWR7-NEXT: sub r9, r9, r27 ; CHECK-PWR7-NEXT: lbz r27, 332(r1) -; CHECK-PWR7-NEXT: xor r9, r9, r26 -; CHECK-PWR7-NEXT: sub r9, r9, r26 +; CHECK-PWR7-NEXT: xor r10, r10, r26 +; CHECK-PWR7-NEXT: sub r10, r10, r26 ; CHECK-PWR7-NEXT: lbz r26, 333(r1) -; CHECK-PWR7-NEXT: lbz r25, 312(r1) -; CHECK-PWR7-NEXT: lbz r24, 328(r1) -; CHECK-PWR7-NEXT: sub r11, r25, r24 +; CHECK-PWR7-NEXT: sub r12, r25, r24 +; CHECK-PWR7-NEXT: srawi r25, r11, 31 +; CHECK-PWR7-NEXT: lbz r3, 320(r1) ; CHECK-PWR7-NEXT: sub r0, r0, r30 -; CHECK-PWR7-NEXT: srawi r25, r10, 31 -; CHECK-PWR7-NEXT: xor r10, r10, r25 -; CHECK-PWR7-NEXT: sub r10, r10, r25 +; CHECK-PWR7-NEXT: xor r11, r11, r25 +; CHECK-PWR7-NEXT: sub r11, r11, r25 ; CHECK-PWR7-NEXT: lbz r25, 334(r1) +; CHECK-PWR7-NEXT: sub r4, r4, r3 ; CHECK-PWR7-NEXT: srawi r30, r0, 31 -; CHECK-PWR7-NEXT: srawi r24, r11, 31 -; CHECK-PWR7-NEXT: xor r11, r11, r24 -; CHECK-PWR7-NEXT: sub r11, r11, r24 +; CHECK-PWR7-NEXT: srawi r24, r12, 31 +; CHECK-PWR7-NEXT: xor r12, r12, r24 +; CHECK-PWR7-NEXT: sub r12, r12, r24 ; CHECK-PWR7-NEXT: lbz r24, 335(r1) -; CHECK-PWR7-NEXT: srawi r12, r3, 31 -; CHECK-PWR7-NEXT: xor r3, r3, r12 +; CHECK-PWR7-NEXT: srawi r3, r4, 31 +; CHECK-PWR7-NEXT: xor r4, r4, r3 ; CHECK-PWR7-NEXT: xor r0, r0, r30 -; CHECK-PWR7-NEXT: sub r3, r3, r12 +; CHECK-PWR7-NEXT: sub r3, r4, r3 ; CHECK-PWR7-NEXT: stb r3, 48(r1) ; CHECK-PWR7-NEXT: addi r3, r1, 288 -; CHECK-PWR7-NEXT: stb r11, 176(r1) +; CHECK-PWR7-NEXT: stb r12, 176(r1) ; CHECK-PWR7-NEXT: sub r0, r0, r30 ; CHECK-PWR7-NEXT: lbz r30, 314(r1) -; CHECK-PWR7-NEXT: stb r10, 160(r1) +; CHECK-PWR7-NEXT: stb r11, 160(r1) ; CHECK-PWR7-NEXT: sub r30, r30, r29 ; CHECK-PWR7-NEXT: stb r0, 192(r1) -; CHECK-PWR7-NEXT: stb r9, 144(r1) -; CHECK-PWR7-NEXT: stb r8, 128(r1) -; CHECK-PWR7-NEXT: stb r7, 112(r1) -; CHECK-PWR7-NEXT: stb r6, 96(r1) -; CHECK-PWR7-NEXT: stb r5, 80(r1) +; CHECK-PWR7-NEXT: stb r10, 144(r1) +; CHECK-PWR7-NEXT: stb r9, 128(r1) +; CHECK-PWR7-NEXT: stb r8, 112(r1) +; CHECK-PWR7-NEXT: stb r7, 96(r1) +; CHECK-PWR7-NEXT: stb r6, 80(r1) ; CHECK-PWR7-NEXT: srawi r29, r30, 31 -; CHECK-PWR7-NEXT: stb r4, 64(r1) +; CHECK-PWR7-NEXT: stb r5, 64(r1) ; CHECK-PWR7-NEXT: xor r30, r30, r29 ; CHECK-PWR7-NEXT: sub r30, r30, r29 ; CHECK-PWR7-NEXT: lbz r29, 315(r1) diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll index 9fea8e5f8ab47..f699ea54192d8 100644 --- a/llvm/test/CodeGen/PowerPC/sat-add.ll +++ b/llvm/test/CodeGen/PowerPC/sat-add.ll @@ -756,7 +756,7 @@ define <4 x i128> @sadd(<4 x i128> %a, <4 x i128> %b) local_unnamed_addr { ; CHECK-NEXT: xxswapd 1, 32 ; CHECK-NEXT: xxswapd 6, 42 ; CHECK-NEXT: mffprd 5, 1 -; CHECK-NEXT: cmpld 5, 3 +; CHECK-NEXT: cmpld 6, 5, 3 ; CHECK-NEXT: mffprd 7, 6 ; CHECK-NEXT: xxswapd 3, 33 ; CHECK-NEXT: xxswapd 7, 43 @@ -765,9 +765,9 @@ define <4 x i128> @sadd(<4 x i128> %a, <4 x i128> %b) local_unnamed_addr { ; CHECK-NEXT: mffprd 6, 5 ; CHECK-NEXT: mffprd 7, 7 ; CHECK-NEXT: mfvsrd 5, 36 -; CHECK-NEXT: cmpld 1, 3, 4 +; CHECK-NEXT: cmpld 3, 4 ; CHECK-NEXT: mfvsrd 3, 34 -; CHECK-NEXT: cmpld 6, 7, 6 +; CHECK-NEXT: cmpld 1, 7, 6 ; CHECK-NEXT: mfvsrd 7, 32 ; CHECK-NEXT: mfvsrd 4, 35 ; CHECK-NEXT: mfvsrd 6, 37 @@ -775,34 +775,34 @@ define <4 x i128> @sadd(<4 x i128> %a, <4 x i128> %b) local_unnamed_addr { ; CHECK-NEXT: cmpd 2, 7, 3 ; CHECK-NEXT: mfvsrd 3, 33 ; CHECK-NEXT: crandc 21, 8, 30 -; CHECK-NEXT: crand 22, 30, 0 -; CHECK-NEXT: cmpld 3, 4 +; CHECK-NEXT: crand 22, 30, 24 +; CHECK-NEXT: cmpld 6, 3, 4 ; CHECK-NEXT: cmpd 7, 3, 4 ; CHECK-NEXT: mfvsrd 4, 42 ; CHECK-NEXT: sradi 3, 3, 63 ; CHECK-NEXT: mtocrf 32, 12 ; CHECK-NEXT: crnor 21, 22, 21 -; CHECK-NEXT: crandc 23, 28, 2 -; CHECK-NEXT: crand 25, 2, 4 +; CHECK-NEXT: crandc 23, 28, 26 +; CHECK-NEXT: crand 24, 26, 0 ; CHECK-NEXT: cmpld 4, 5 -; CHECK-NEXT: cmpd 1, 4, 5 +; CHECK-NEXT: cmpd 7, 4, 5 ; CHECK-NEXT: mfvsrd 5, 43 -; CHECK-NEXT: crnor 22, 25, 23 +; CHECK-NEXT: crnor 22, 24, 23 ; CHECK-NEXT: mtfprd 5, 3 ; CHECK-NEXT: sradi 4, 4, 63 ; CHECK-NEXT: mtfprd 6, 4 -; CHECK-NEXT: crandc 26, 4, 2 +; CHECK-NEXT: crandc 25, 28, 2 ; CHECK-NEXT: crand 20, 2, 20 ; CHECK-NEXT: cmpld 5, 6 -; CHECK-NEXT: cmpd 1, 5, 6 +; CHECK-NEXT: cmpd 7, 5, 6 ; CHECK-NEXT: mfvsrd 6, 38 ; CHECK-NEXT: sradi 5, 5, 63 -; CHECK-NEXT: crnor 20, 20, 26 +; CHECK-NEXT: crnor 20, 20, 25 ; CHECK-NEXT: mtfprd 7, 5 ; CHECK-NEXT: sradi 6, 6, 63 -; CHECK-NEXT: crandc 27, 4, 2 -; CHECK-NEXT: crand 24, 2, 24 -; CHECK-NEXT: crnor 23, 24, 27 +; CHECK-NEXT: crandc 26, 28, 2 +; CHECK-NEXT: crand 27, 2, 4 +; CHECK-NEXT: crnor 23, 27, 26 ; CHECK-NEXT: mtfprd 0, 6 ; CHECK-NEXT: mfvsrd 6, 39 ; CHECK-NEXT: sradi 6, 6, 63 diff --git a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll index a263b56ce70ce..df55b92997765 100644 --- a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll @@ -647,27 +647,27 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; ; P8BE-LABEL: combine_srem_sdiv: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r3, v2 -; P8BE-NEXT: lis r4, -21386 -; P8BE-NEXT: ori r4, r4, 37253 -; P8BE-NEXT: clrldi r5, r3, 48 -; P8BE-NEXT: rldicl r6, r3, 48, 48 -; P8BE-NEXT: rldicl r7, r3, 32, 48 -; P8BE-NEXT: rldicl r3, r3, 16, 48 -; P8BE-NEXT: extsh r8, r5 +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r5, -21386 +; P8BE-NEXT: ori r5, r5, 37253 +; P8BE-NEXT: clrldi r3, r4, 48 +; P8BE-NEXT: rldicl r6, r4, 48, 48 +; P8BE-NEXT: rldicl r7, r4, 32, 48 +; P8BE-NEXT: rldicl r4, r4, 16, 48 +; P8BE-NEXT: extsh r8, r3 ; P8BE-NEXT: extsh r9, r6 ; P8BE-NEXT: extsh r10, r7 -; P8BE-NEXT: extsh r3, r3 -; P8BE-NEXT: mulhw r11, r8, r4 +; P8BE-NEXT: extsh r4, r4 +; P8BE-NEXT: mulhw r11, r8, r5 ; P8BE-NEXT: add r8, r11, r8 -; P8BE-NEXT: mulhw r11, r9, r4 +; P8BE-NEXT: mulhw r11, r9, r5 ; P8BE-NEXT: add r9, r11, r9 -; P8BE-NEXT: mulhw r11, r10, r4 -; P8BE-NEXT: mulhw r4, r3, r4 +; P8BE-NEXT: mulhw r11, r10, r5 +; P8BE-NEXT: mulhw r5, r4, r5 ; P8BE-NEXT: add r10, r11, r10 ; P8BE-NEXT: srwi r11, r8, 31 ; P8BE-NEXT: srawi r8, r8, 6 -; P8BE-NEXT: add r4, r4, r3 +; P8BE-NEXT: add r5, r5, r4 ; P8BE-NEXT: add r8, r8, r11 ; P8BE-NEXT: srwi r11, r9, 31 ; P8BE-NEXT: srawi r9, r9, 6 @@ -676,30 +676,30 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; P8BE-NEXT: srawi r10, r10, 6 ; P8BE-NEXT: mtvsrwz v3, r8 ; P8BE-NEXT: add r10, r10, r11 -; P8BE-NEXT: srwi r11, r4, 31 -; P8BE-NEXT: srawi r4, r4, 6 +; P8BE-NEXT: srwi r11, r5, 31 +; P8BE-NEXT: srawi r5, r5, 6 ; P8BE-NEXT: mtvsrwz v4, r9 -; P8BE-NEXT: add r4, r4, r11 +; P8BE-NEXT: add r5, r5, r11 ; P8BE-NEXT: mulli r11, r8, 95 -; P8BE-NEXT: sub r5, r5, r11 +; P8BE-NEXT: sub r3, r3, r11 ; P8BE-NEXT: mulli r11, r9, 95 -; P8BE-NEXT: mtvsrwz v5, r5 +; P8BE-NEXT: mtvsrwz v5, r3 ; P8BE-NEXT: sub r6, r6, r11 ; P8BE-NEXT: mulli r11, r10, 95 ; P8BE-NEXT: mtvsrwz v0, r6 ; P8BE-NEXT: sub r7, r7, r11 -; P8BE-NEXT: mulli r11, r4, 95 +; P8BE-NEXT: mulli r11, r5, 95 ; P8BE-NEXT: mtvsrwz v1, r7 -; P8BE-NEXT: sub r3, r3, r11 +; P8BE-NEXT: sub r4, r4, r11 ; P8BE-NEXT: addis r11, r2, .LCPI2_0@toc@ha ; P8BE-NEXT: addi r11, r11, .LCPI2_0@toc@l ; P8BE-NEXT: lxvw4x v2, 0, r11 ; P8BE-NEXT: vperm v5, v0, v5, v2 -; P8BE-NEXT: mtvsrwz v0, r3 +; P8BE-NEXT: mtvsrwz v0, r4 ; P8BE-NEXT: vperm v3, v4, v3, v2 ; P8BE-NEXT: mtvsrwz v4, r10 ; P8BE-NEXT: vperm v0, v0, v1, v2 -; P8BE-NEXT: mtvsrwz v1, r4 +; P8BE-NEXT: mtvsrwz v1, r5 ; P8BE-NEXT: vperm v2, v1, v4, v2 ; P8BE-NEXT: xxmrghw v4, v0, v5 ; P8BE-NEXT: xxmrghw v2, v2, v3 diff --git a/llvm/test/CodeGen/PowerPC/sub-of-not.ll b/llvm/test/CodeGen/PowerPC/sub-of-not.ll index d2b55aaf7ac83..9cd2ec5510886 100644 --- a/llvm/test/CodeGen/PowerPC/sub-of-not.ll +++ b/llvm/test/CodeGen/PowerPC/sub-of-not.ll @@ -65,88 +65,89 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; PPC32: # %bb.0: ; PPC32-NEXT: stwu 1, -64(1) ; PPC32-NEXT: stw 21, 20(1) # 4-byte Folded Spill -; PPC32-NEXT: lbz 21, 123(1) ; PPC32-NEXT: stw 22, 24(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill -; PPC32-NEXT: add 7, 21, 7 -; PPC32-NEXT: lbz 23, 115(1) +; PPC32-NEXT: lbz 4, 115(1) ; PPC32-NEXT: lbz 22, 119(1) -; PPC32-NEXT: lbz 21, 135(1) -; PPC32-NEXT: add 5, 23, 5 -; PPC32-NEXT: lbz 23, 127(1) -; PPC32-NEXT: add 6, 22, 6 +; PPC32-NEXT: lbz 21, 123(1) +; PPC32-NEXT: add 4, 4, 5 +; PPC32-NEXT: add 5, 22, 6 ; PPC32-NEXT: lbz 22, 131(1) -; PPC32-NEXT: add 10, 21, 10 -; PPC32-NEXT: stw 26, 40(1) # 4-byte Folded Spill -; PPC32-NEXT: add 8, 23, 8 -; PPC32-NEXT: lbz 26, 83(1) +; PPC32-NEXT: add 6, 21, 7 +; PPC32-NEXT: lbz 21, 135(1) +; PPC32-NEXT: addi 6, 6, 1 +; PPC32-NEXT: stw 20, 16(1) # 4-byte Folded Spill ; PPC32-NEXT: add 9, 22, 9 +; PPC32-NEXT: lbz 20, 127(1) +; PPC32-NEXT: add 10, 21, 10 +; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill +; PPC32-NEXT: addi 5, 5, 1 +; PPC32-NEXT: lbz 25, 83(1) +; PPC32-NEXT: add 7, 20, 8 ; PPC32-NEXT: lbz 21, 147(1) +; PPC32-NEXT: addi 7, 7, 1 ; PPC32-NEXT: stw 24, 32(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill -; PPC32-NEXT: add 26, 21, 26 -; PPC32-NEXT: lbz 25, 79(1) -; PPC32-NEXT: lbz 24, 75(1) -; PPC32-NEXT: lbz 23, 139(1) +; PPC32-NEXT: addi 4, 4, 1 +; PPC32-NEXT: lbz 24, 79(1) +; PPC32-NEXT: add 25, 21, 25 ; PPC32-NEXT: lbz 22, 143(1) -; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill -; PPC32-NEXT: add 24, 23, 24 -; PPC32-NEXT: lbz 29, 95(1) -; PPC32-NEXT: add 25, 22, 25 +; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill +; PPC32-NEXT: lbz 23, 75(1) +; PPC32-NEXT: add 24, 22, 24 +; PPC32-NEXT: lbz 8, 139(1) +; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill +; PPC32-NEXT: lbz 28, 95(1) +; PPC32-NEXT: add 8, 8, 23 ; PPC32-NEXT: lbz 21, 159(1) +; PPC32-NEXT: addi 8, 8, 1 ; PPC32-NEXT: stw 27, 44(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill -; PPC32-NEXT: add 29, 21, 29 -; PPC32-NEXT: lbz 28, 91(1) -; PPC32-NEXT: lbz 27, 87(1) -; PPC32-NEXT: lbz 23, 151(1) +; PPC32-NEXT: lbz 27, 91(1) +; PPC32-NEXT: add 28, 21, 28 ; PPC32-NEXT: lbz 22, 155(1) -; PPC32-NEXT: lbz 4, 111(1) -; PPC32-NEXT: add 27, 23, 27 +; PPC32-NEXT: stw 26, 40(1) # 4-byte Folded Spill +; PPC32-NEXT: lbz 26, 87(1) +; PPC32-NEXT: add 27, 22, 27 +; PPC32-NEXT: lbz 23, 151(1) +; PPC32-NEXT: lbz 11, 111(1) ; PPC32-NEXT: lbz 21, 175(1) -; PPC32-NEXT: add 28, 22, 28 -; PPC32-NEXT: lbz 11, 107(1) -; PPC32-NEXT: lbz 12, 171(1) -; PPC32-NEXT: add 4, 21, 4 +; PPC32-NEXT: add 26, 23, 26 +; PPC32-NEXT: lbz 12, 107(1) +; PPC32-NEXT: lbz 0, 171(1) +; PPC32-NEXT: add 11, 21, 11 ; PPC32-NEXT: stw 30, 56(1) # 4-byte Folded Spill -; PPC32-NEXT: addi 4, 4, 1 -; PPC32-NEXT: lbz 0, 103(1) -; PPC32-NEXT: add 11, 12, 11 -; PPC32-NEXT: lbz 30, 99(1) -; PPC32-NEXT: lbz 23, 163(1) +; PPC32-NEXT: addi 11, 11, 1 +; PPC32-NEXT: lbz 30, 103(1) +; PPC32-NEXT: add 12, 0, 12 ; PPC32-NEXT: lbz 22, 167(1) -; PPC32-NEXT: add 30, 23, 30 -; PPC32-NEXT: stb 4, 15(3) -; PPC32-NEXT: add 23, 22, 0 -; PPC32-NEXT: addi 4, 11, 1 -; PPC32-NEXT: stb 4, 14(3) -; PPC32-NEXT: addi 4, 23, 1 -; PPC32-NEXT: stb 4, 13(3) -; PPC32-NEXT: addi 4, 30, 1 -; PPC32-NEXT: stb 4, 12(3) -; PPC32-NEXT: addi 4, 29, 1 -; PPC32-NEXT: stb 4, 11(3) -; PPC32-NEXT: addi 4, 28, 1 -; PPC32-NEXT: stb 4, 10(3) -; PPC32-NEXT: addi 4, 27, 1 -; PPC32-NEXT: stb 4, 9(3) -; PPC32-NEXT: addi 4, 26, 1 -; PPC32-NEXT: stb 4, 8(3) -; PPC32-NEXT: addi 4, 25, 1 -; PPC32-NEXT: stb 4, 7(3) -; PPC32-NEXT: addi 4, 24, 1 -; PPC32-NEXT: stb 4, 6(3) -; PPC32-NEXT: addi 4, 10, 1 -; PPC32-NEXT: stb 4, 5(3) -; PPC32-NEXT: addi 4, 9, 1 -; PPC32-NEXT: stb 4, 4(3) -; PPC32-NEXT: addi 4, 8, 1 -; PPC32-NEXT: stb 4, 3(3) -; PPC32-NEXT: addi 4, 7, 1 -; PPC32-NEXT: stb 4, 2(3) -; PPC32-NEXT: addi 4, 6, 1 -; PPC32-NEXT: stb 4, 1(3) -; PPC32-NEXT: addi 4, 5, 1 +; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill +; PPC32-NEXT: lbz 29, 99(1) +; PPC32-NEXT: add 30, 22, 30 +; PPC32-NEXT: lbz 23, 163(1) +; PPC32-NEXT: stb 11, 15(3) +; PPC32-NEXT: addi 11, 12, 1 +; PPC32-NEXT: add 29, 23, 29 +; PPC32-NEXT: stb 11, 14(3) +; PPC32-NEXT: addi 11, 30, 1 +; PPC32-NEXT: stb 11, 13(3) +; PPC32-NEXT: addi 11, 29, 1 +; PPC32-NEXT: stb 11, 12(3) +; PPC32-NEXT: addi 11, 28, 1 +; PPC32-NEXT: stb 11, 11(3) +; PPC32-NEXT: addi 11, 27, 1 +; PPC32-NEXT: stb 11, 10(3) +; PPC32-NEXT: addi 11, 26, 1 +; PPC32-NEXT: stb 11, 9(3) +; PPC32-NEXT: addi 11, 25, 1 +; PPC32-NEXT: stb 8, 6(3) +; PPC32-NEXT: addi 8, 10, 1 +; PPC32-NEXT: stb 11, 8(3) +; PPC32-NEXT: addi 11, 24, 1 +; PPC32-NEXT: stb 8, 5(3) +; PPC32-NEXT: addi 8, 9, 1 +; PPC32-NEXT: stb 11, 7(3) +; PPC32-NEXT: stb 8, 4(3) +; PPC32-NEXT: stb 7, 3(3) +; PPC32-NEXT: stb 6, 2(3) +; PPC32-NEXT: stb 5, 1(3) ; PPC32-NEXT: stb 4, 0(3) ; PPC32-NEXT: lwz 30, 56(1) # 4-byte Folded Reload ; PPC32-NEXT: lwz 29, 52(1) # 4-byte Folded Reload @@ -158,6 +159,7 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; PPC32-NEXT: lwz 23, 28(1) # 4-byte Folded Reload ; PPC32-NEXT: lwz 22, 24(1) # 4-byte Folded Reload ; PPC32-NEXT: lwz 21, 20(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 20, 16(1) # 4-byte Folded Reload ; PPC32-NEXT: addi 1, 1, 64 ; PPC32-NEXT: blr ; diff --git a/llvm/test/CodeGen/PowerPC/tocSaveInPrologue.ll b/llvm/test/CodeGen/PowerPC/tocSaveInPrologue.ll index abda0c897a7cf..43aeb3655c5de 100644 --- a/llvm/test/CodeGen/PowerPC/tocSaveInPrologue.ll +++ b/llvm/test/CodeGen/PowerPC/tocSaveInPrologue.ll @@ -15,13 +15,13 @@ define dso_local void @test(ptr nocapture %fp, i32 signext %Arg, i32 signext %Le ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-NEXT: stdu r1, -64(r1) ; CHECK-NEXT: cmpwi r5, 1 -; CHECK-NEXT: mr r30, r4 +; CHECK-NEXT: mr r29, r4 ; CHECK-NEXT: std r2, 24(r1) ; CHECK-NEXT: std r0, 80(r1) -; CHECK-NEXT: mr r29, r3 +; CHECK-NEXT: mr r30, r3 ; CHECK-NEXT: bc 12, lt, .LBB0_4 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: cmpwi r30, 11 +; CHECK-NEXT: cmpwi r29, 11 ; CHECK-NEXT: bc 12, lt, .LBB0_4 ; CHECK-NEXT: # %bb.2: # %for.body.us.preheader ; CHECK-NEXT: addi r3, r5, -1 @@ -30,18 +30,18 @@ define dso_local void @test(ptr nocapture %fp, i32 signext %Arg, i32 signext %Le ; CHECK-NEXT: .p2align 5 ; CHECK-NEXT: .LBB0_3: # %for.body.us ; CHECK-NEXT: # -; CHECK-NEXT: mtctr r29 -; CHECK-NEXT: mr r3, r30 -; CHECK-NEXT: mr r12, r29 +; CHECK-NEXT: mtctr r30 +; CHECK-NEXT: mr r3, r29 +; CHECK-NEXT: mr r12, r30 ; CHECK-NEXT: bctrl ; CHECK-NEXT: ld 2, 24(r1) ; CHECK-NEXT: addi r28, r28, -1 ; CHECK-NEXT: cmpldi r28, 0 ; CHECK-NEXT: bc 12, gt, .LBB0_3 ; CHECK-NEXT: .LBB0_4: # %for.cond.cleanup -; CHECK-NEXT: mtctr r29 -; CHECK-NEXT: mr r3, r30 -; CHECK-NEXT: mr r12, r29 +; CHECK-NEXT: mtctr r30 +; CHECK-NEXT: mr r3, r29 +; CHECK-NEXT: mr r12, r30 ; CHECK-NEXT: bctrl ; CHECK-NEXT: ld 2, 24(r1) ; CHECK-NEXT: addi r1, r1, 64 diff --git a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll index e2bbb04cf532a..48098e3a277c1 100644 --- a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll @@ -36,68 +36,68 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; PPC32-NEXT: stw 26, 40(1) # 4-byte Folded Spill ; PPC32-NEXT: mulhwu. 26, 7, 6 ; PPC32-NEXT: mcrf 1, 0 +; PPC32-NEXT: stw 30, 56(1) # 4-byte Folded Spill ; PPC32-NEXT: mfcr 12 ; PPC32-NEXT: cmpwi 7, 5, 0 ; PPC32-NEXT: cmpwi 2, 7, 0 -; PPC32-NEXT: stw 22, 24(1) # 4-byte Folded Spill ; PPC32-NEXT: mulhwu. 26, 5, 8 ; PPC32-NEXT: mcrf 5, 0 -; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 22, 24(1) # 4-byte Folded Spill ; PPC32-NEXT: crnor 20, 30, 10 -; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill ; PPC32-NEXT: cmpwi 7, 9, 0 ; PPC32-NEXT: mulhwu. 26, 3, 10 ; PPC32-NEXT: mcrf 6, 0 +; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill ; PPC32-NEXT: cmpwi 2, 3, 0 ; PPC32-NEXT: stw 24, 32(1) # 4-byte Folded Spill ; PPC32-NEXT: crnor 21, 30, 10 -; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill ; PPC32-NEXT: mulhwu. 26, 9, 4 -; PPC32-NEXT: stw 27, 44(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill ; PPC32-NEXT: crorc 20, 20, 6 -; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 27, 44(1) # 4-byte Folded Spill ; PPC32-NEXT: crorc 21, 21, 26 -; PPC32-NEXT: stw 30, 56(1) # 4-byte Folded Spill -; PPC32-NEXT: mulhwu 0, 6, 10 +; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill +; PPC32-NEXT: mulhwu 30, 6, 10 ; PPC32-NEXT: stw 12, 20(1) ; PPC32-NEXT: crorc 20, 20, 22 ; PPC32-NEXT: crorc 21, 21, 2 ; PPC32-NEXT: li 11, 0 ; PPC32-NEXT: mullw 26, 5, 10 -; PPC32-NEXT: addc 0, 26, 0 +; PPC32-NEXT: addc 30, 26, 30 ; PPC32-NEXT: mulhwu 29, 5, 10 ; PPC32-NEXT: addze 29, 29 ; PPC32-NEXT: mullw 23, 5, 8 ; PPC32-NEXT: mullw 22, 7, 6 -; PPC32-NEXT: mulhwu 30, 6, 9 +; PPC32-NEXT: mulhwu 0, 6, 9 ; PPC32-NEXT: mulhwu 12, 5, 9 -; PPC32-NEXT: mulhwu 28, 8, 6 +; PPC32-NEXT: mulhwu 27, 8, 6 ; PPC32-NEXT: mullw 25, 6, 9 ; PPC32-NEXT: mullw 24, 5, 9 ; PPC32-NEXT: mullw 5, 9, 4 ; PPC32-NEXT: add 9, 22, 23 -; PPC32-NEXT: add 9, 28, 9 -; PPC32-NEXT: cmplw 1, 9, 28 +; PPC32-NEXT: add 9, 27, 9 +; PPC32-NEXT: cmplw 1, 9, 27 ; PPC32-NEXT: cror 20, 20, 4 ; PPC32-NEXT: mullw 23, 3, 10 ; PPC32-NEXT: add 26, 23, 5 -; PPC32-NEXT: addc 5, 25, 0 -; PPC32-NEXT: addze 30, 30 +; PPC32-NEXT: addc 5, 25, 30 +; PPC32-NEXT: addze 0, 0 ; PPC32-NEXT: or. 3, 4, 3 -; PPC32-NEXT: mulhwu 27, 4, 10 +; PPC32-NEXT: mulhwu 28, 4, 10 ; PPC32-NEXT: mcrf 1, 0 -; PPC32-NEXT: addc 3, 29, 30 -; PPC32-NEXT: add 26, 27, 26 -; PPC32-NEXT: cmplw 6, 26, 27 +; PPC32-NEXT: addc 3, 29, 0 +; PPC32-NEXT: add 26, 28, 26 +; PPC32-NEXT: cmplw 6, 26, 28 ; PPC32-NEXT: cror 21, 21, 24 -; PPC32-NEXT: mullw 0, 4, 10 +; PPC32-NEXT: mullw 30, 4, 10 ; PPC32-NEXT: or. 4, 8, 7 ; PPC32-NEXT: addze 4, 11 ; PPC32-NEXT: addc 7, 24, 3 ; PPC32-NEXT: crnor 22, 2, 6 -; PPC32-NEXT: mullw 28, 8, 6 +; PPC32-NEXT: mullw 27, 8, 6 ; PPC32-NEXT: adde 8, 12, 4 -; PPC32-NEXT: addc 3, 0, 28 +; PPC32-NEXT: addc 3, 30, 27 ; PPC32-NEXT: adde 9, 26, 9 ; PPC32-NEXT: addc 4, 7, 3 ; PPC32-NEXT: adde 3, 8, 9 diff --git a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll index 98314a02c23fe..a2ad2946cc8ec 100644 --- a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll @@ -897,31 +897,31 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { ; P8LE-NEXT: mfvsrd r6, v2 ; P8LE-NEXT: mfvsrd r8, v3 ; P8LE-NEXT: ori r3, r3, 51289 -; P8LE-NEXT: mffprd r4, f0 ; P8LE-NEXT: ori r5, r5, 42889 -; P8LE-NEXT: rldic r3, r3, 36, 1 +; P8LE-NEXT: rldic r4, r3, 36, 1 +; P8LE-NEXT: mffprd r3, f0 ; P8LE-NEXT: rldic r5, r5, 35, 1 ; P8LE-NEXT: rldicl r7, r6, 63, 1 -; P8LE-NEXT: oris r3, r3, 45590 +; P8LE-NEXT: oris r4, r4, 45590 ; P8LE-NEXT: oris r5, r5, 1603 -; P8LE-NEXT: ori r3, r3, 17097 +; P8LE-NEXT: ori r4, r4, 17097 ; P8LE-NEXT: ori r5, r5, 21445 -; P8LE-NEXT: mulhdu r3, r4, r3 +; P8LE-NEXT: mulhdu r4, r3, r4 ; P8LE-NEXT: mulhdu r5, r7, r5 -; P8LE-NEXT: sub r7, r4, r3 +; P8LE-NEXT: sub r7, r3, r4 ; P8LE-NEXT: rldicl r5, r5, 57, 7 ; P8LE-NEXT: rldicl r7, r7, 63, 1 ; P8LE-NEXT: mulli r5, r5, 654 -; P8LE-NEXT: add r3, r7, r3 +; P8LE-NEXT: add r4, r7, r4 ; P8LE-NEXT: lis r7, -16037 ; P8LE-NEXT: ori r7, r7, 28749 -; P8LE-NEXT: rldicl r3, r3, 60, 4 +; P8LE-NEXT: rldicl r4, r4, 60, 4 ; P8LE-NEXT: sub r5, r6, r5 ; P8LE-NEXT: rldic r7, r7, 32, 0 -; P8LE-NEXT: mulli r3, r3, 23 +; P8LE-NEXT: mulli r4, r4, 23 ; P8LE-NEXT: oris r7, r7, 52170 ; P8LE-NEXT: ori r7, r7, 12109 -; P8LE-NEXT: sub r3, r4, r3 +; P8LE-NEXT: sub r3, r3, r4 ; P8LE-NEXT: mulhdu r7, r8, r7 ; P8LE-NEXT: mtfprd f1, r3 ; P8LE-NEXT: li r3, 0 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll index 07a8fb06caa11..cc38f921b117b 100644 --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll @@ -169,50 +169,50 @@ define <8 x i16> @test8elt(ptr nocapture readonly) local_unnamed_addr #2 { ; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 ; CHECK-P8-NEXT: li r4, 16 ; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 -; CHECK-P8-NEXT: xxswapd v2, vs0 +; CHECK-P8-NEXT: xxswapd v3, vs0 ; CHECK-P8-NEXT: xscvspdpn f0, vs0 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v2 +; CHECK-P8-NEXT: xscvspdpn f0, v3 ; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xxsldwi vs1, v2, v2, 3 +; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs3, v2, v2, 1 +; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs3 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: xscvspdpn f0, vs2 -; CHECK-P8-NEXT: xxswapd v3, vs2 +; CHECK-P8-NEXT: xxswapd v2, vs2 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: mtvsrd v2, r3 +; CHECK-P8-NEXT: mtvsrd v3, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v3 +; CHECK-P8-NEXT: xscvspdpn f0, v2 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs4, v3, v3, 3 +; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs4 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs5, v3, v3, 1 +; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 ; CHECK-P8-NEXT: vmrghh v4, v4, v5 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs5 ; CHECK-P8-NEXT: mtvsrd v0, r4 -; CHECK-P8-NEXT: vmrghh v2, v2, v5 +; CHECK-P8-NEXT: vmrghh v3, v3, v5 ; CHECK-P8-NEXT: mtvsrd v5, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvdpsxws f0, f1 -; CHECK-P8-NEXT: mtvsrd v3, r3 +; CHECK-P8-NEXT: mtvsrd v2, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: vmrghh v5, v5, v0 ; CHECK-P8-NEXT: mtvsrd v0, r3 -; CHECK-P8-NEXT: xxmrglw vs0, v2, v4 -; CHECK-P8-NEXT: vmrghh v3, v3, v0 -; CHECK-P8-NEXT: xxmrglw vs1, v3, v5 +; CHECK-P8-NEXT: xxmrglw vs0, v3, v4 +; CHECK-P8-NEXT: vmrghh v2, v2, v0 +; CHECK-P8-NEXT: xxmrglw vs1, v2, v5 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: blr ; @@ -329,43 +329,43 @@ entry: define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-LABEL: test16elt: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-P8-NEXT: lxvd2x vs1, 0, r4 ; CHECK-P8-NEXT: li r6, 32 ; CHECK-P8-NEXT: li r5, 16 ; CHECK-P8-NEXT: lxvd2x vs6, r4, r6 ; CHECK-P8-NEXT: li r6, 48 -; CHECK-P8-NEXT: lxvd2x vs2, r4, r5 +; CHECK-P8-NEXT: lxvd2x vs0, r4, r5 ; CHECK-P8-NEXT: lxvd2x vs8, r4, r6 -; CHECK-P8-NEXT: xxswapd v3, vs0 -; CHECK-P8-NEXT: xscvspdpn f0, vs0 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v3 -; CHECK-P8-NEXT: mtvsrd v0, r4 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 +; CHECK-P8-NEXT: xxswapd v3, vs1 ; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 ; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: xscvspdpn f1, vs3 -; CHECK-P8-NEXT: xxswapd v2, vs2 -; CHECK-P8-NEXT: xxswapd v4, vs6 -; CHECK-P8-NEXT: xxswapd v5, vs8 -; CHECK-P8-NEXT: mtvsrd v1, r4 +; CHECK-P8-NEXT: xscvspdpn f1, v3 +; CHECK-P8-NEXT: mtvsrd v0, r4 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvspdpn f0, vs2 +; CHECK-P8-NEXT: xxsldwi vs2, v3, v3, 3 +; CHECK-P8-NEXT: xscvspdpn f2, vs2 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 +; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 +; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: xscvspdpn f2, vs3 +; CHECK-P8-NEXT: xxswapd v2, vs0 +; CHECK-P8-NEXT: xscvspdpn f0, vs0 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: xxswapd v5, vs6 +; CHECK-P8-NEXT: xxswapd v4, vs8 +; CHECK-P8-NEXT: mtvsrd v1, r4 ; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 ; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 -; CHECK-P8-NEXT: xxsldwi vs7, v4, v4, 3 -; CHECK-P8-NEXT: xxsldwi vs9, v4, v4, 1 -; CHECK-P8-NEXT: xxsldwi vs10, v5, v5, 3 -; CHECK-P8-NEXT: xxsldwi vs11, v5, v5, 1 -; CHECK-P8-NEXT: mtvsrd v3, r4 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs4 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: xxsldwi vs7, v5, v5, 3 +; CHECK-P8-NEXT: xxsldwi vs9, v5, v5, 1 +; CHECK-P8-NEXT: xxsldwi vs10, v4, v4, 3 +; CHECK-P8-NEXT: xxsldwi vs11, v4, v4, 1 +; CHECK-P8-NEXT: mtvsrd v3, r4 +; CHECK-P8-NEXT: mffprwz r4, f2 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: vmrghh v0, v0, v1 ; CHECK-P8-NEXT: mtvsrd v1, r4 @@ -388,7 +388,7 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n ; CHECK-P8-NEXT: vmrghh v1, v1, v6 ; CHECK-P8-NEXT: mtvsrd v6, r4 ; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v4 +; CHECK-P8-NEXT: xscvspdpn f0, v5 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: vmrghh v2, v2, v6 ; CHECK-P8-NEXT: mtvsrd v6, r4 @@ -398,7 +398,7 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs10 -; CHECK-P8-NEXT: mtvsrd v4, r4 +; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: vmrghh v6, v6, v7 @@ -407,16 +407,16 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n ; CHECK-P8-NEXT: xscvspdpn f0, vs8 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: xxmrglw vs1, v2, v1 -; CHECK-P8-NEXT: vmrghh v4, v4, v7 +; CHECK-P8-NEXT: vmrghh v5, v5, v7 ; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v5 +; CHECK-P8-NEXT: xscvspdpn f0, v4 ; CHECK-P8-NEXT: mtvsrd v8, r4 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs11 -; CHECK-P8-NEXT: xxmrglw vs2, v4, v6 -; CHECK-P8-NEXT: mtvsrd v5, r4 +; CHECK-P8-NEXT: xxmrglw vs2, v5, v6 +; CHECK-P8-NEXT: mtvsrd v4, r4 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: vmrghh v7, v8, v7 @@ -424,8 +424,8 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n ; CHECK-P8-NEXT: xxmrglw vs0, v3, v0 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: xxswapd vs1, v2 -; CHECK-P8-NEXT: vmrghh v5, v5, v8 -; CHECK-P8-NEXT: xxmrglw vs3, v5, v7 +; CHECK-P8-NEXT: vmrghh v4, v4, v8 +; CHECK-P8-NEXT: xxmrglw vs3, v4, v7 ; CHECK-P8-NEXT: xxmrgld v3, vs3, vs2 ; CHECK-P8-NEXT: xxswapd vs0, v3 ; CHECK-P8-NEXT: stxvd2x vs0, r3, r5 @@ -534,38 +534,38 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n ; ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxv vs2, 16(r4) -; CHECK-BE-NEXT: lxv vs1, 0(r4) +; CHECK-BE-NEXT: lxv vs3, 16(r4) +; CHECK-BE-NEXT: lxv vs2, 0(r4) ; CHECK-BE-NEXT: addis r5, r2, .LCPI3_0@toc@ha -; CHECK-BE-NEXT: lxv vs0, 48(r4) +; CHECK-BE-NEXT: lxv vs1, 48(r4) ; CHECK-BE-NEXT: addi r5, r5, .LCPI3_0@toc@l -; CHECK-BE-NEXT: lxv vs3, 0(r5) -; CHECK-BE-NEXT: xscvspdpn f6, vs2 -; CHECK-BE-NEXT: xxsldwi vs4, vs2, vs2, 3 -; CHECK-BE-NEXT: xscvspdpn f9, vs1 -; CHECK-BE-NEXT: xxswapd vs5, vs2 +; CHECK-BE-NEXT: lxv vs0, 0(r5) +; CHECK-BE-NEXT: xscvspdpn f6, vs3 +; CHECK-BE-NEXT: xxsldwi vs4, vs3, vs3, 3 +; CHECK-BE-NEXT: xscvspdpn f9, vs2 +; CHECK-BE-NEXT: xxswapd vs5, vs3 +; CHECK-BE-NEXT: xxsldwi vs3, vs3, vs3, 1 +; CHECK-BE-NEXT: xxsldwi vs7, vs2, vs2, 3 +; CHECK-BE-NEXT: xxswapd vs8, vs2 ; CHECK-BE-NEXT: xxsldwi vs2, vs2, vs2, 1 -; CHECK-BE-NEXT: xxsldwi vs7, vs1, vs1, 3 -; CHECK-BE-NEXT: xxswapd vs8, vs1 -; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 -; CHECK-BE-NEXT: xxsldwi vs10, vs0, vs0, 3 -; CHECK-BE-NEXT: xxswapd vs11, vs0 +; CHECK-BE-NEXT: xxsldwi vs10, vs1, vs1, 3 +; CHECK-BE-NEXT: xxswapd vs11, vs1 ; CHECK-BE-NEXT: xscvdpsxws f6, f6 ; CHECK-BE-NEXT: xscvspdpn f4, vs4 ; CHECK-BE-NEXT: xscvdpsxws f9, f9 ; CHECK-BE-NEXT: xscvspdpn f5, vs5 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 +; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: xscvspdpn f7, vs7 ; CHECK-BE-NEXT: xscvspdpn f8, vs8 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xscvspdpn f10, vs10 ; CHECK-BE-NEXT: xscvspdpn f11, vs11 ; CHECK-BE-NEXT: xscvdpsxws f4, f4 ; CHECK-BE-NEXT: xscvdpsxws f5, f5 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: xscvdpsxws f7, f7 ; CHECK-BE-NEXT: xscvdpsxws f8, f8 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: xscvdpsxws f10, f10 ; CHECK-BE-NEXT: xscvdpsxws f11, f11 ; CHECK-BE-NEXT: mffprwz r5, f6 @@ -576,50 +576,50 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n ; CHECK-BE-NEXT: mtfprwz f4, r5 ; CHECK-BE-NEXT: mffprwz r5, f5 ; CHECK-BE-NEXT: mtfprwz f5, r5 -; CHECK-BE-NEXT: mffprwz r5, f2 -; CHECK-BE-NEXT: xxperm vs4, vs5, vs3 -; CHECK-BE-NEXT: xscvspdpn f5, vs0 -; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 -; CHECK-BE-NEXT: mtfprwz f2, r5 +; CHECK-BE-NEXT: mffprwz r5, f3 +; CHECK-BE-NEXT: xxperm vs4, vs5, vs0 +; CHECK-BE-NEXT: xscvspdpn f5, vs1 +; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 +; CHECK-BE-NEXT: mtfprwz f3, r5 ; CHECK-BE-NEXT: mffprwz r5, f7 ; CHECK-BE-NEXT: mtfprwz f7, r5 ; CHECK-BE-NEXT: mffprwz r5, f8 -; CHECK-BE-NEXT: xxperm vs2, vs6, vs3 -; CHECK-BE-NEXT: xscvspdpn f0, vs0 +; CHECK-BE-NEXT: xxperm vs3, vs6, vs0 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: xscvdpsxws f5, f5 ; CHECK-BE-NEXT: mtfprwz f8, r5 -; CHECK-BE-NEXT: mffprwz r5, f1 -; CHECK-BE-NEXT: xxmrghw vs2, vs2, vs4 +; CHECK-BE-NEXT: mffprwz r5, f2 +; CHECK-BE-NEXT: xxmrghw vs3, vs3, vs4 ; CHECK-BE-NEXT: lxv vs4, 32(r4) -; CHECK-BE-NEXT: xscvdpsxws f0, f0 -; CHECK-BE-NEXT: mtfprwz f1, r5 -; CHECK-BE-NEXT: xxperm vs7, vs8, vs3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: mtfprwz f2, r5 +; CHECK-BE-NEXT: xxperm vs7, vs8, vs0 ; CHECK-BE-NEXT: mffprwz r5, f10 -; CHECK-BE-NEXT: xxperm vs1, vs9, vs3 +; CHECK-BE-NEXT: xxperm vs2, vs9, vs0 ; CHECK-BE-NEXT: mtfprwz f10, r5 ; CHECK-BE-NEXT: mffprwz r5, f11 ; CHECK-BE-NEXT: mffprwz r4, f5 ; CHECK-BE-NEXT: mtfprwz f11, r5 -; CHECK-BE-NEXT: xxmrghw vs1, vs1, vs7 +; CHECK-BE-NEXT: xxmrghw vs2, vs2, vs7 ; CHECK-BE-NEXT: mtfprwz f5, r4 -; CHECK-BE-NEXT: xxperm vs10, vs11, vs3 -; CHECK-BE-NEXT: mffprwz r4, f0 -; CHECK-BE-NEXT: xxmrghd vs1, vs1, vs2 -; CHECK-BE-NEXT: xxsldwi vs2, vs4, vs4, 3 -; CHECK-BE-NEXT: mtfprwz f0, r4 -; CHECK-BE-NEXT: xxperm vs0, vs5, vs3 +; CHECK-BE-NEXT: xxperm vs10, vs11, vs0 +; CHECK-BE-NEXT: mffprwz r4, f1 +; CHECK-BE-NEXT: xxmrghd vs2, vs2, vs3 +; CHECK-BE-NEXT: xxsldwi vs3, vs4, vs4, 3 +; CHECK-BE-NEXT: mtfprwz f1, r4 +; CHECK-BE-NEXT: xxperm vs1, vs5, vs0 ; CHECK-BE-NEXT: xxswapd vs5, vs4 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: stxv vs1, 0(r3) +; CHECK-BE-NEXT: xscvspdpn f3, vs3 +; CHECK-BE-NEXT: stxv vs2, 0(r3) ; CHECK-BE-NEXT: xscvspdpn f5, vs5 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: xxmrghw vs0, vs0, vs10 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 +; CHECK-BE-NEXT: xxmrghw vs1, vs1, vs10 ; CHECK-BE-NEXT: xscvdpsxws f5, f5 -; CHECK-BE-NEXT: mffprwz r4, f2 -; CHECK-BE-NEXT: mtfprwz f2, r4 +; CHECK-BE-NEXT: mffprwz r4, f3 +; CHECK-BE-NEXT: mtfprwz f3, r4 ; CHECK-BE-NEXT: mffprwz r4, f5 ; CHECK-BE-NEXT: mtfprwz f5, r4 -; CHECK-BE-NEXT: xxperm vs2, vs5, vs3 +; CHECK-BE-NEXT: xxperm vs3, vs5, vs0 ; CHECK-BE-NEXT: xscvspdpn f5, vs4 ; CHECK-BE-NEXT: xxsldwi vs4, vs4, vs4, 1 ; CHECK-BE-NEXT: xscvspdpn f4, vs4 @@ -629,9 +629,9 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n ; CHECK-BE-NEXT: mtfprwz f5, r4 ; CHECK-BE-NEXT: mffprwz r4, f4 ; CHECK-BE-NEXT: mtfprwz f4, r4 -; CHECK-BE-NEXT: xxperm vs4, vs5, vs3 -; CHECK-BE-NEXT: xxmrghw vs2, vs4, vs2 -; CHECK-BE-NEXT: xxmrghd vs0, vs2, vs0 +; CHECK-BE-NEXT: xxperm vs4, vs5, vs0 +; CHECK-BE-NEXT: xxmrghw vs0, vs4, vs3 +; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs1 ; CHECK-BE-NEXT: stxv vs0, 16(r3) ; CHECK-BE-NEXT: blr entry: @@ -801,50 +801,50 @@ define <8 x i16> @test8elt_signed(ptr nocapture readonly) local_unnamed_addr #2 ; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 ; CHECK-P8-NEXT: li r4, 16 ; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 -; CHECK-P8-NEXT: xxswapd v2, vs0 +; CHECK-P8-NEXT: xxswapd v3, vs0 ; CHECK-P8-NEXT: xscvspdpn f0, vs0 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v2 +; CHECK-P8-NEXT: xscvspdpn f0, v3 ; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xxsldwi vs1, v2, v2, 3 +; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs3, v2, v2, 1 +; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs3 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: xscvspdpn f0, vs2 -; CHECK-P8-NEXT: xxswapd v3, vs2 +; CHECK-P8-NEXT: xxswapd v2, vs2 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: mtvsrd v2, r3 +; CHECK-P8-NEXT: mtvsrd v3, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v3 +; CHECK-P8-NEXT: xscvspdpn f0, v2 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs4, v3, v3, 3 +; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs4 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs5, v3, v3, 1 +; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 ; CHECK-P8-NEXT: vmrghh v4, v4, v5 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs5 ; CHECK-P8-NEXT: mtvsrd v0, r4 -; CHECK-P8-NEXT: vmrghh v2, v2, v5 +; CHECK-P8-NEXT: vmrghh v3, v3, v5 ; CHECK-P8-NEXT: mtvsrd v5, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvdpsxws f0, f1 -; CHECK-P8-NEXT: mtvsrd v3, r3 +; CHECK-P8-NEXT: mtvsrd v2, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: vmrghh v5, v5, v0 ; CHECK-P8-NEXT: mtvsrd v0, r3 -; CHECK-P8-NEXT: xxmrglw vs0, v2, v4 -; CHECK-P8-NEXT: vmrghh v3, v3, v0 -; CHECK-P8-NEXT: xxmrglw vs1, v3, v5 +; CHECK-P8-NEXT: xxmrglw vs0, v3, v4 +; CHECK-P8-NEXT: vmrghh v2, v2, v0 +; CHECK-P8-NEXT: xxmrglw vs1, v2, v5 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: blr ; @@ -961,43 +961,43 @@ entry: define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-LABEL: test16elt_signed: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-P8-NEXT: lxvd2x vs1, 0, r4 ; CHECK-P8-NEXT: li r6, 32 ; CHECK-P8-NEXT: li r5, 16 ; CHECK-P8-NEXT: lxvd2x vs6, r4, r6 ; CHECK-P8-NEXT: li r6, 48 -; CHECK-P8-NEXT: lxvd2x vs2, r4, r5 +; CHECK-P8-NEXT: lxvd2x vs0, r4, r5 ; CHECK-P8-NEXT: lxvd2x vs8, r4, r6 -; CHECK-P8-NEXT: xxswapd v3, vs0 -; CHECK-P8-NEXT: xscvspdpn f0, vs0 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v3 -; CHECK-P8-NEXT: mtvsrd v0, r4 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 +; CHECK-P8-NEXT: xxswapd v3, vs1 ; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 ; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: xscvspdpn f1, vs3 -; CHECK-P8-NEXT: xxswapd v2, vs2 -; CHECK-P8-NEXT: xxswapd v4, vs6 -; CHECK-P8-NEXT: xxswapd v5, vs8 -; CHECK-P8-NEXT: mtvsrd v1, r4 +; CHECK-P8-NEXT: xscvspdpn f1, v3 +; CHECK-P8-NEXT: mtvsrd v0, r4 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvspdpn f0, vs2 +; CHECK-P8-NEXT: xxsldwi vs2, v3, v3, 3 +; CHECK-P8-NEXT: xscvspdpn f2, vs2 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 +; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 +; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: xscvspdpn f2, vs3 +; CHECK-P8-NEXT: xxswapd v2, vs0 +; CHECK-P8-NEXT: xscvspdpn f0, vs0 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: xxswapd v5, vs6 +; CHECK-P8-NEXT: xxswapd v4, vs8 +; CHECK-P8-NEXT: mtvsrd v1, r4 ; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 ; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 -; CHECK-P8-NEXT: xxsldwi vs7, v4, v4, 3 -; CHECK-P8-NEXT: xxsldwi vs9, v4, v4, 1 -; CHECK-P8-NEXT: xxsldwi vs10, v5, v5, 3 -; CHECK-P8-NEXT: xxsldwi vs11, v5, v5, 1 -; CHECK-P8-NEXT: mtvsrd v3, r4 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs4 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: xxsldwi vs7, v5, v5, 3 +; CHECK-P8-NEXT: xxsldwi vs9, v5, v5, 1 +; CHECK-P8-NEXT: xxsldwi vs10, v4, v4, 3 +; CHECK-P8-NEXT: xxsldwi vs11, v4, v4, 1 +; CHECK-P8-NEXT: mtvsrd v3, r4 +; CHECK-P8-NEXT: mffprwz r4, f2 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: vmrghh v0, v0, v1 ; CHECK-P8-NEXT: mtvsrd v1, r4 @@ -1020,7 +1020,7 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; CHECK-P8-NEXT: vmrghh v1, v1, v6 ; CHECK-P8-NEXT: mtvsrd v6, r4 ; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v4 +; CHECK-P8-NEXT: xscvspdpn f0, v5 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: vmrghh v2, v2, v6 ; CHECK-P8-NEXT: mtvsrd v6, r4 @@ -1030,7 +1030,7 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs10 -; CHECK-P8-NEXT: mtvsrd v4, r4 +; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: vmrghh v6, v6, v7 @@ -1039,16 +1039,16 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; CHECK-P8-NEXT: xscvspdpn f0, vs8 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: xxmrglw vs1, v2, v1 -; CHECK-P8-NEXT: vmrghh v4, v4, v7 +; CHECK-P8-NEXT: vmrghh v5, v5, v7 ; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v5 +; CHECK-P8-NEXT: xscvspdpn f0, v4 ; CHECK-P8-NEXT: mtvsrd v8, r4 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs11 -; CHECK-P8-NEXT: xxmrglw vs2, v4, v6 -; CHECK-P8-NEXT: mtvsrd v5, r4 +; CHECK-P8-NEXT: xxmrglw vs2, v5, v6 +; CHECK-P8-NEXT: mtvsrd v4, r4 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: vmrghh v7, v8, v7 @@ -1056,8 +1056,8 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; CHECK-P8-NEXT: xxmrglw vs0, v3, v0 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: xxswapd vs1, v2 -; CHECK-P8-NEXT: vmrghh v5, v5, v8 -; CHECK-P8-NEXT: xxmrglw vs3, v5, v7 +; CHECK-P8-NEXT: vmrghh v4, v4, v8 +; CHECK-P8-NEXT: xxmrglw vs3, v4, v7 ; CHECK-P8-NEXT: xxmrgld v3, vs3, vs2 ; CHECK-P8-NEXT: xxswapd vs0, v3 ; CHECK-P8-NEXT: stxvd2x vs0, r3, r5 @@ -1166,38 +1166,38 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxv vs2, 16(r4) -; CHECK-BE-NEXT: lxv vs1, 0(r4) +; CHECK-BE-NEXT: lxv vs3, 16(r4) +; CHECK-BE-NEXT: lxv vs2, 0(r4) ; CHECK-BE-NEXT: addis r5, r2, .LCPI7_0@toc@ha -; CHECK-BE-NEXT: lxv vs0, 48(r4) +; CHECK-BE-NEXT: lxv vs1, 48(r4) ; CHECK-BE-NEXT: addi r5, r5, .LCPI7_0@toc@l -; CHECK-BE-NEXT: lxv vs3, 0(r5) -; CHECK-BE-NEXT: xscvspdpn f6, vs2 -; CHECK-BE-NEXT: xxsldwi vs4, vs2, vs2, 3 -; CHECK-BE-NEXT: xscvspdpn f9, vs1 -; CHECK-BE-NEXT: xxswapd vs5, vs2 +; CHECK-BE-NEXT: lxv vs0, 0(r5) +; CHECK-BE-NEXT: xscvspdpn f6, vs3 +; CHECK-BE-NEXT: xxsldwi vs4, vs3, vs3, 3 +; CHECK-BE-NEXT: xscvspdpn f9, vs2 +; CHECK-BE-NEXT: xxswapd vs5, vs3 +; CHECK-BE-NEXT: xxsldwi vs3, vs3, vs3, 1 +; CHECK-BE-NEXT: xxsldwi vs7, vs2, vs2, 3 +; CHECK-BE-NEXT: xxswapd vs8, vs2 ; CHECK-BE-NEXT: xxsldwi vs2, vs2, vs2, 1 -; CHECK-BE-NEXT: xxsldwi vs7, vs1, vs1, 3 -; CHECK-BE-NEXT: xxswapd vs8, vs1 -; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 -; CHECK-BE-NEXT: xxsldwi vs10, vs0, vs0, 3 -; CHECK-BE-NEXT: xxswapd vs11, vs0 +; CHECK-BE-NEXT: xxsldwi vs10, vs1, vs1, 3 +; CHECK-BE-NEXT: xxswapd vs11, vs1 ; CHECK-BE-NEXT: xscvdpsxws f6, f6 ; CHECK-BE-NEXT: xscvspdpn f4, vs4 ; CHECK-BE-NEXT: xscvdpsxws f9, f9 ; CHECK-BE-NEXT: xscvspdpn f5, vs5 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 +; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: xscvspdpn f7, vs7 ; CHECK-BE-NEXT: xscvspdpn f8, vs8 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xscvspdpn f10, vs10 ; CHECK-BE-NEXT: xscvspdpn f11, vs11 ; CHECK-BE-NEXT: xscvdpsxws f4, f4 ; CHECK-BE-NEXT: xscvdpsxws f5, f5 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: xscvdpsxws f7, f7 ; CHECK-BE-NEXT: xscvdpsxws f8, f8 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: xscvdpsxws f10, f10 ; CHECK-BE-NEXT: xscvdpsxws f11, f11 ; CHECK-BE-NEXT: mffprwz r5, f6 @@ -1208,50 +1208,50 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; CHECK-BE-NEXT: mtfprwz f4, r5 ; CHECK-BE-NEXT: mffprwz r5, f5 ; CHECK-BE-NEXT: mtfprwz f5, r5 -; CHECK-BE-NEXT: mffprwz r5, f2 -; CHECK-BE-NEXT: xxperm vs4, vs5, vs3 -; CHECK-BE-NEXT: xscvspdpn f5, vs0 -; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 -; CHECK-BE-NEXT: mtfprwz f2, r5 +; CHECK-BE-NEXT: mffprwz r5, f3 +; CHECK-BE-NEXT: xxperm vs4, vs5, vs0 +; CHECK-BE-NEXT: xscvspdpn f5, vs1 +; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 +; CHECK-BE-NEXT: mtfprwz f3, r5 ; CHECK-BE-NEXT: mffprwz r5, f7 ; CHECK-BE-NEXT: mtfprwz f7, r5 ; CHECK-BE-NEXT: mffprwz r5, f8 -; CHECK-BE-NEXT: xxperm vs2, vs6, vs3 -; CHECK-BE-NEXT: xscvspdpn f0, vs0 +; CHECK-BE-NEXT: xxperm vs3, vs6, vs0 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: xscvdpsxws f5, f5 ; CHECK-BE-NEXT: mtfprwz f8, r5 -; CHECK-BE-NEXT: mffprwz r5, f1 -; CHECK-BE-NEXT: xxmrghw vs2, vs2, vs4 +; CHECK-BE-NEXT: mffprwz r5, f2 +; CHECK-BE-NEXT: xxmrghw vs3, vs3, vs4 ; CHECK-BE-NEXT: lxv vs4, 32(r4) -; CHECK-BE-NEXT: xscvdpsxws f0, f0 -; CHECK-BE-NEXT: mtfprwz f1, r5 -; CHECK-BE-NEXT: xxperm vs7, vs8, vs3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: mtfprwz f2, r5 +; CHECK-BE-NEXT: xxperm vs7, vs8, vs0 ; CHECK-BE-NEXT: mffprwz r5, f10 -; CHECK-BE-NEXT: xxperm vs1, vs9, vs3 +; CHECK-BE-NEXT: xxperm vs2, vs9, vs0 ; CHECK-BE-NEXT: mtfprwz f10, r5 ; CHECK-BE-NEXT: mffprwz r5, f11 ; CHECK-BE-NEXT: mffprwz r4, f5 ; CHECK-BE-NEXT: mtfprwz f11, r5 -; CHECK-BE-NEXT: xxmrghw vs1, vs1, vs7 +; CHECK-BE-NEXT: xxmrghw vs2, vs2, vs7 ; CHECK-BE-NEXT: mtfprwz f5, r4 -; CHECK-BE-NEXT: xxperm vs10, vs11, vs3 -; CHECK-BE-NEXT: mffprwz r4, f0 -; CHECK-BE-NEXT: xxmrghd vs1, vs1, vs2 -; CHECK-BE-NEXT: xxsldwi vs2, vs4, vs4, 3 -; CHECK-BE-NEXT: mtfprwz f0, r4 -; CHECK-BE-NEXT: xxperm vs0, vs5, vs3 +; CHECK-BE-NEXT: xxperm vs10, vs11, vs0 +; CHECK-BE-NEXT: mffprwz r4, f1 +; CHECK-BE-NEXT: xxmrghd vs2, vs2, vs3 +; CHECK-BE-NEXT: xxsldwi vs3, vs4, vs4, 3 +; CHECK-BE-NEXT: mtfprwz f1, r4 +; CHECK-BE-NEXT: xxperm vs1, vs5, vs0 ; CHECK-BE-NEXT: xxswapd vs5, vs4 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: stxv vs1, 0(r3) +; CHECK-BE-NEXT: xscvspdpn f3, vs3 +; CHECK-BE-NEXT: stxv vs2, 0(r3) ; CHECK-BE-NEXT: xscvspdpn f5, vs5 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: xxmrghw vs0, vs0, vs10 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 +; CHECK-BE-NEXT: xxmrghw vs1, vs1, vs10 ; CHECK-BE-NEXT: xscvdpsxws f5, f5 -; CHECK-BE-NEXT: mffprwz r4, f2 -; CHECK-BE-NEXT: mtfprwz f2, r4 +; CHECK-BE-NEXT: mffprwz r4, f3 +; CHECK-BE-NEXT: mtfprwz f3, r4 ; CHECK-BE-NEXT: mffprwz r4, f5 ; CHECK-BE-NEXT: mtfprwz f5, r4 -; CHECK-BE-NEXT: xxperm vs2, vs5, vs3 +; CHECK-BE-NEXT: xxperm vs3, vs5, vs0 ; CHECK-BE-NEXT: xscvspdpn f5, vs4 ; CHECK-BE-NEXT: xxsldwi vs4, vs4, vs4, 1 ; CHECK-BE-NEXT: xscvspdpn f4, vs4 @@ -1261,9 +1261,9 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; CHECK-BE-NEXT: mtfprwz f5, r4 ; CHECK-BE-NEXT: mffprwz r4, f4 ; CHECK-BE-NEXT: mtfprwz f4, r4 -; CHECK-BE-NEXT: xxperm vs4, vs5, vs3 -; CHECK-BE-NEXT: xxmrghw vs2, vs4, vs2 -; CHECK-BE-NEXT: xxmrghd vs0, vs2, vs0 +; CHECK-BE-NEXT: xxperm vs4, vs5, vs0 +; CHECK-BE-NEXT: xxmrghw vs0, vs4, vs3 +; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs1 ; CHECK-BE-NEXT: stxv vs0, 16(r3) ; CHECK-BE-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll index dfa49a8278157..c6e808d145ebb 100644 --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll @@ -178,51 +178,51 @@ define i64 @test8elt(ptr nocapture readonly) local_unnamed_addr #2 { ; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 ; CHECK-P8-NEXT: li r4, 16 ; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 -; CHECK-P8-NEXT: xxswapd v2, vs0 +; CHECK-P8-NEXT: xxswapd v3, vs0 ; CHECK-P8-NEXT: xscvspdpn f0, vs0 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v2 +; CHECK-P8-NEXT: xscvspdpn f0, v3 ; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xxsldwi vs1, v2, v2, 3 +; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs3, v2, v2, 1 +; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs3 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: xscvspdpn f0, vs2 -; CHECK-P8-NEXT: xxswapd v3, vs2 +; CHECK-P8-NEXT: xxswapd v2, vs2 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: mtvsrd v2, r3 +; CHECK-P8-NEXT: mtvsrd v3, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v3 +; CHECK-P8-NEXT: xscvspdpn f0, v2 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs4, v3, v3, 3 +; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs4 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs5, v3, v3, 1 +; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 ; CHECK-P8-NEXT: vmrghb v4, v4, v5 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs5 ; CHECK-P8-NEXT: mtvsrd v0, r4 -; CHECK-P8-NEXT: vmrghb v2, v2, v5 +; CHECK-P8-NEXT: vmrghb v3, v3, v5 ; CHECK-P8-NEXT: mtvsrd v5, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvdpsxws f0, f1 -; CHECK-P8-NEXT: mtvsrd v3, r3 +; CHECK-P8-NEXT: mtvsrd v2, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: vmrglh v2, v2, v4 +; CHECK-P8-NEXT: vmrglh v3, v3, v4 ; CHECK-P8-NEXT: vmrghb v5, v5, v0 ; CHECK-P8-NEXT: mtvsrd v0, r3 -; CHECK-P8-NEXT: vmrghb v3, v3, v0 -; CHECK-P8-NEXT: vmrglh v3, v3, v5 -; CHECK-P8-NEXT: xxmrglw vs0, v3, v2 +; CHECK-P8-NEXT: vmrghb v2, v2, v0 +; CHECK-P8-NEXT: vmrglh v2, v2, v5 +; CHECK-P8-NEXT: xxmrglw vs0, v2, v3 ; CHECK-P8-NEXT: xxswapd vs0, vs0 ; CHECK-P8-NEXT: mffprd r3, f0 ; CHECK-P8-NEXT: blr @@ -343,47 +343,47 @@ entry: define <16 x i8> @test16elt(ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-LABEL: test16elt: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-P8-NEXT: lxvd2x vs1, 0, r3 ; CHECK-P8-NEXT: li r4, 16 -; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs0, r3, r4 ; CHECK-P8-NEXT: li r4, 32 ; CHECK-P8-NEXT: lxvd2x vs6, r3, r4 ; CHECK-P8-NEXT: li r4, 48 ; CHECK-P8-NEXT: lxvd2x vs8, r3, r4 -; CHECK-P8-NEXT: xxswapd v3, vs0 -; CHECK-P8-NEXT: xscvspdpn f0, vs0 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v3 -; CHECK-P8-NEXT: mtvsrd v0, r3 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 +; CHECK-P8-NEXT: xxswapd v3, vs1 ; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 -; CHECK-P8-NEXT: xscvspdpn f0, vs2 -; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: xscvspdpn f1, vs3 +; CHECK-P8-NEXT: mffprwz r3, f1 +; CHECK-P8-NEXT: xscvspdpn f1, v3 +; CHECK-P8-NEXT: mtvsrd v0, r3 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxswapd v2, vs2 +; CHECK-P8-NEXT: mffprwz r3, f1 +; CHECK-P8-NEXT: xxsldwi vs2, v3, v3, 3 +; CHECK-P8-NEXT: xscvspdpn f2, vs2 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 +; CHECK-P8-NEXT: xxswapd v2, vs0 +; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 +; CHECK-P8-NEXT: xscvspdpn f0, vs0 +; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: xscvspdpn f2, vs3 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 +; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 ; CHECK-P8-NEXT: mtvsrd v1, r4 -; CHECK-P8-NEXT: xxswapd v4, vs6 -; CHECK-P8-NEXT: mffprwz r4, f1 +; CHECK-P8-NEXT: xscvspdpn f1, vs4 +; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 +; CHECK-P8-NEXT: xxswapd v5, vs6 +; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: xxswapd v4, vs8 ; CHECK-P8-NEXT: mtvsrd v3, r3 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, v2 -; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 -; CHECK-P8-NEXT: xscvspdpn f1, vs4 +; CHECK-P8-NEXT: xxsldwi vs7, v5, v5, 3 +; CHECK-P8-NEXT: xxsldwi vs9, v5, v5, 1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 -; CHECK-P8-NEXT: xxsldwi vs7, v4, v4, 3 -; CHECK-P8-NEXT: xxswapd v5, vs8 -; CHECK-P8-NEXT: xxsldwi vs9, v4, v4, 1 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs10, v5, v5, 3 -; CHECK-P8-NEXT: xxsldwi vs11, v5, v5, 1 +; CHECK-P8-NEXT: xxsldwi vs10, v4, v4, 3 +; CHECK-P8-NEXT: xxsldwi vs11, v4, v4, 1 ; CHECK-P8-NEXT: vmrghb v0, v0, v1 ; CHECK-P8-NEXT: mtvsrd v1, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 @@ -400,7 +400,7 @@ define <16 x i8> @test16elt(ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mtvsrd v2, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v4 +; CHECK-P8-NEXT: xscvspdpn f0, v5 ; CHECK-P8-NEXT: vmrglh v3, v3, v0 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: vmrghb v1, v1, v6 @@ -414,32 +414,32 @@ define <16 x i8> @test16elt(ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-NEXT: mtvsrd v6, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs10 -; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: mtvsrd v5, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs8 ; CHECK-P8-NEXT: vmrglh v2, v2, v1 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: vmrghb v6, v6, v7 ; CHECK-P8-NEXT: mtvsrd v7, r4 -; CHECK-P8-NEXT: vmrghb v4, v4, v7 +; CHECK-P8-NEXT: vmrghb v5, v5, v7 ; CHECK-P8-NEXT: mtvsrd v7, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v5 +; CHECK-P8-NEXT: xscvspdpn f0, v4 ; CHECK-P8-NEXT: mtvsrd v8, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs11 -; CHECK-P8-NEXT: vmrglh v4, v4, v6 -; CHECK-P8-NEXT: mtvsrd v5, r3 +; CHECK-P8-NEXT: vmrglh v5, v5, v6 +; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: vmrghb v7, v8, v7 ; CHECK-P8-NEXT: mtvsrd v8, r3 ; CHECK-P8-NEXT: xxmrglw vs0, v2, v3 -; CHECK-P8-NEXT: vmrghb v5, v5, v8 -; CHECK-P8-NEXT: vmrglh v5, v5, v7 -; CHECK-P8-NEXT: xxmrglw vs1, v5, v4 +; CHECK-P8-NEXT: vmrghb v4, v4, v8 +; CHECK-P8-NEXT: vmrglh v4, v4, v7 +; CHECK-P8-NEXT: xxmrglw vs1, v4, v5 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: blr ; @@ -818,51 +818,51 @@ define i64 @test8elt_signed(ptr nocapture readonly) local_unnamed_addr #2 { ; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 ; CHECK-P8-NEXT: li r4, 16 ; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 -; CHECK-P8-NEXT: xxswapd v2, vs0 +; CHECK-P8-NEXT: xxswapd v3, vs0 ; CHECK-P8-NEXT: xscvspdpn f0, vs0 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v2 +; CHECK-P8-NEXT: xscvspdpn f0, v3 ; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xxsldwi vs1, v2, v2, 3 +; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs3, v2, v2, 1 +; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs3 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: xscvspdpn f0, vs2 -; CHECK-P8-NEXT: xxswapd v3, vs2 +; CHECK-P8-NEXT: xxswapd v2, vs2 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: mtvsrd v2, r3 +; CHECK-P8-NEXT: mtvsrd v3, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v3 +; CHECK-P8-NEXT: xscvspdpn f0, v2 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs4, v3, v3, 3 +; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs4 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs5, v3, v3, 1 +; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 ; CHECK-P8-NEXT: vmrghb v4, v4, v5 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs5 ; CHECK-P8-NEXT: mtvsrd v0, r4 -; CHECK-P8-NEXT: vmrghb v2, v2, v5 +; CHECK-P8-NEXT: vmrghb v3, v3, v5 ; CHECK-P8-NEXT: mtvsrd v5, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvdpsxws f0, f1 -; CHECK-P8-NEXT: mtvsrd v3, r3 +; CHECK-P8-NEXT: mtvsrd v2, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: vmrglh v2, v2, v4 +; CHECK-P8-NEXT: vmrglh v3, v3, v4 ; CHECK-P8-NEXT: vmrghb v5, v5, v0 ; CHECK-P8-NEXT: mtvsrd v0, r3 -; CHECK-P8-NEXT: vmrghb v3, v3, v0 -; CHECK-P8-NEXT: vmrglh v3, v3, v5 -; CHECK-P8-NEXT: xxmrglw vs0, v3, v2 +; CHECK-P8-NEXT: vmrghb v2, v2, v0 +; CHECK-P8-NEXT: vmrglh v2, v2, v5 +; CHECK-P8-NEXT: xxmrglw vs0, v2, v3 ; CHECK-P8-NEXT: xxswapd vs0, vs0 ; CHECK-P8-NEXT: mffprd r3, f0 ; CHECK-P8-NEXT: blr @@ -983,47 +983,47 @@ entry: define <16 x i8> @test16elt_signed(ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-LABEL: test16elt_signed: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-P8-NEXT: lxvd2x vs1, 0, r3 ; CHECK-P8-NEXT: li r4, 16 -; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs0, r3, r4 ; CHECK-P8-NEXT: li r4, 32 ; CHECK-P8-NEXT: lxvd2x vs6, r3, r4 ; CHECK-P8-NEXT: li r4, 48 ; CHECK-P8-NEXT: lxvd2x vs8, r3, r4 -; CHECK-P8-NEXT: xxswapd v3, vs0 -; CHECK-P8-NEXT: xscvspdpn f0, vs0 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v3 -; CHECK-P8-NEXT: mtvsrd v0, r3 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 +; CHECK-P8-NEXT: xxswapd v3, vs1 ; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 -; CHECK-P8-NEXT: xscvspdpn f0, vs2 -; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: xscvspdpn f1, vs3 +; CHECK-P8-NEXT: mffprwz r3, f1 +; CHECK-P8-NEXT: xscvspdpn f1, v3 +; CHECK-P8-NEXT: mtvsrd v0, r3 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxswapd v2, vs2 +; CHECK-P8-NEXT: mffprwz r3, f1 +; CHECK-P8-NEXT: xxsldwi vs2, v3, v3, 3 +; CHECK-P8-NEXT: xscvspdpn f2, vs2 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 +; CHECK-P8-NEXT: xxswapd v2, vs0 +; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 +; CHECK-P8-NEXT: xscvspdpn f0, vs0 +; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: xscvspdpn f2, vs3 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 +; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 ; CHECK-P8-NEXT: mtvsrd v1, r4 -; CHECK-P8-NEXT: xxswapd v4, vs6 -; CHECK-P8-NEXT: mffprwz r4, f1 +; CHECK-P8-NEXT: xscvspdpn f1, vs4 +; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 +; CHECK-P8-NEXT: xxswapd v5, vs6 +; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: xxswapd v4, vs8 ; CHECK-P8-NEXT: mtvsrd v3, r3 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, v2 -; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 -; CHECK-P8-NEXT: xscvspdpn f1, vs4 +; CHECK-P8-NEXT: xxsldwi vs7, v5, v5, 3 +; CHECK-P8-NEXT: xxsldwi vs9, v5, v5, 1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 -; CHECK-P8-NEXT: xxsldwi vs7, v4, v4, 3 -; CHECK-P8-NEXT: xxswapd v5, vs8 -; CHECK-P8-NEXT: xxsldwi vs9, v4, v4, 1 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs10, v5, v5, 3 -; CHECK-P8-NEXT: xxsldwi vs11, v5, v5, 1 +; CHECK-P8-NEXT: xxsldwi vs10, v4, v4, 3 +; CHECK-P8-NEXT: xxsldwi vs11, v4, v4, 1 ; CHECK-P8-NEXT: vmrghb v0, v0, v1 ; CHECK-P8-NEXT: mtvsrd v1, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 @@ -1040,7 +1040,7 @@ define <16 x i8> @test16elt_signed(ptr nocapture readonly) local_unnamed_addr #3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mtvsrd v2, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v4 +; CHECK-P8-NEXT: xscvspdpn f0, v5 ; CHECK-P8-NEXT: vmrglh v3, v3, v0 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: vmrghb v1, v1, v6 @@ -1054,32 +1054,32 @@ define <16 x i8> @test16elt_signed(ptr nocapture readonly) local_unnamed_addr #3 ; CHECK-P8-NEXT: mtvsrd v6, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs10 -; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: mtvsrd v5, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs8 ; CHECK-P8-NEXT: vmrglh v2, v2, v1 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: vmrghb v6, v6, v7 ; CHECK-P8-NEXT: mtvsrd v7, r4 -; CHECK-P8-NEXT: vmrghb v4, v4, v7 +; CHECK-P8-NEXT: vmrghb v5, v5, v7 ; CHECK-P8-NEXT: mtvsrd v7, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v5 +; CHECK-P8-NEXT: xscvspdpn f0, v4 ; CHECK-P8-NEXT: mtvsrd v8, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs11 -; CHECK-P8-NEXT: vmrglh v4, v4, v6 -; CHECK-P8-NEXT: mtvsrd v5, r3 +; CHECK-P8-NEXT: vmrglh v5, v5, v6 +; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: vmrghb v7, v8, v7 ; CHECK-P8-NEXT: mtvsrd v8, r3 ; CHECK-P8-NEXT: xxmrglw vs0, v2, v3 -; CHECK-P8-NEXT: vmrghb v5, v5, v8 -; CHECK-P8-NEXT: vmrglh v5, v5, v7 -; CHECK-P8-NEXT: xxmrglw vs1, v5, v4 +; CHECK-P8-NEXT: vmrghb v4, v4, v8 +; CHECK-P8-NEXT: vmrglh v4, v4, v7 +; CHECK-P8-NEXT: xxmrglw vs1, v4, v5 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: blr ; diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll index 5dcb2e4be3e37..00ca205e85972 100644 --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll @@ -288,92 +288,92 @@ entry: define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-LABEL: test16elt: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: lxvd2x vs2, 0, r4 +; CHECK-P8-NEXT: lxvd2x vs5, 0, r4 ; CHECK-P8-NEXT: li r5, 80 ; CHECK-P8-NEXT: li r6, 32 -; CHECK-P8-NEXT: lxvd2x vs0, r4, r5 +; CHECK-P8-NEXT: lxvd2x vs3, r4, r5 ; CHECK-P8-NEXT: li r5, 16 -; CHECK-P8-NEXT: lxvd2x vs6, r4, r6 +; CHECK-P8-NEXT: lxvd2x vs9, r4, r6 ; CHECK-P8-NEXT: li r6, 64 -; CHECK-P8-NEXT: lxvd2x vs4, r4, r5 -; CHECK-P8-NEXT: lxvd2x vs8, r4, r6 +; CHECK-P8-NEXT: lxvd2x vs7, r4, r5 +; CHECK-P8-NEXT: lxvd2x vs11, r4, r6 ; CHECK-P8-NEXT: li r6, 48 -; CHECK-P8-NEXT: lxvd2x vs9, r4, r6 -; CHECK-P8-NEXT: li r6, 96 ; CHECK-P8-NEXT: lxvd2x vs12, r4, r6 +; CHECK-P8-NEXT: li r6, 96 +; CHECK-P8-NEXT: lxvd2x vs2, r4, r6 ; CHECK-P8-NEXT: li r6, 112 -; CHECK-P8-NEXT: lxvd2x v2, r4, r6 -; CHECK-P8-NEXT: xxswapd vs3, vs2 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 -; CHECK-P8-NEXT: xscvdpsxws f3, f3 -; CHECK-P8-NEXT: mffprwz r4, f2 -; CHECK-P8-NEXT: mtvsrd v4, r4 -; CHECK-P8-NEXT: xxswapd vs7, vs6 +; CHECK-P8-NEXT: lxvd2x vs0, r4, r6 +; CHECK-P8-NEXT: xxswapd vs6, vs5 +; CHECK-P8-NEXT: xscvdpsxws f5, f5 ; CHECK-P8-NEXT: xscvdpsxws f6, f6 -; CHECK-P8-NEXT: xxswapd vs1, vs0 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xscvdpsxws f7, f7 -; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxswapd vs5, vs4 +; CHECK-P8-NEXT: mffprwz r4, f5 +; CHECK-P8-NEXT: mtvsrd v4, r4 +; CHECK-P8-NEXT: xxswapd vs10, vs9 +; CHECK-P8-NEXT: xscvdpsxws f9, f9 +; CHECK-P8-NEXT: xxswapd vs4, vs3 +; CHECK-P8-NEXT: xscvdpsxws f3, f3 +; CHECK-P8-NEXT: xscvdpsxws f10, f10 ; CHECK-P8-NEXT: xscvdpsxws f4, f4 -; CHECK-P8-NEXT: mffprwz r4, f4 -; CHECK-P8-NEXT: xxswapd vs10, vs8 +; CHECK-P8-NEXT: xxswapd vs8, vs7 +; CHECK-P8-NEXT: xscvdpsxws f7, f7 +; CHECK-P8-NEXT: mffprwz r4, f7 +; CHECK-P8-NEXT: xxswapd vs13, vs11 +; CHECK-P8-NEXT: xscvdpsxws f11, f11 ; CHECK-P8-NEXT: xscvdpsxws f8, f8 -; CHECK-P8-NEXT: xscvdpsxws f5, f5 -; CHECK-P8-NEXT: xscvdpsxws f10, f10 +; CHECK-P8-NEXT: xscvdpsxws f13, f13 ; CHECK-P8-NEXT: mtvsrd v5, r4 -; CHECK-P8-NEXT: xxswapd vs11, vs9 -; CHECK-P8-NEXT: xscvdpsxws f9, f9 -; CHECK-P8-NEXT: mffprwz r4, f6 -; CHECK-P8-NEXT: xscvdpsxws f11, f11 -; CHECK-P8-NEXT: mtvsrd v0, r4 +; CHECK-P8-NEXT: xxswapd v2, vs12 +; CHECK-P8-NEXT: xscvdpsxws f12, f12 ; CHECK-P8-NEXT: mffprwz r4, f9 +; CHECK-P8-NEXT: xscvdpsxws v2, v2 +; CHECK-P8-NEXT: mtvsrd v0, r4 +; CHECK-P8-NEXT: mffprwz r4, f12 ; CHECK-P8-NEXT: mtvsrd v1, r4 -; CHECK-P8-NEXT: mffprwz r4, f8 -; CHECK-P8-NEXT: xxswapd vs13, vs12 -; CHECK-P8-NEXT: xscvdpsxws f13, f13 +; CHECK-P8-NEXT: mffprwz r4, f11 +; CHECK-P8-NEXT: xxswapd v3, vs2 +; CHECK-P8-NEXT: xscvdpsxws v3, v3 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 ; CHECK-P8-NEXT: mtvsrd v6, r4 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, f12 -; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: mffprwz r4, f3 +; CHECK-P8-NEXT: mtvsrd v7, r4 +; CHECK-P8-NEXT: mffprwz r4, f6 ; CHECK-P8-NEXT: mtvsrd v8, r4 -; CHECK-P8-NEXT: mffprwz r4, f5 -; CHECK-P8-NEXT: xxswapd v3, v2 +; CHECK-P8-NEXT: mffprwz r4, f8 +; CHECK-P8-NEXT: xxswapd vs1, vs0 +; CHECK-P8-NEXT: xscvdpsxws f1, f1 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mffprwz r4, f7 +; CHECK-P8-NEXT: mffprwz r4, f10 ; CHECK-P8-NEXT: mtvsrd v10, r4 -; CHECK-P8-NEXT: mffprwz r4, f11 +; CHECK-P8-NEXT: mfvsrwz r4, v2 +; CHECK-P8-NEXT: mtvsrd v2, r4 +; CHECK-P8-NEXT: mffprwz r4, f13 ; CHECK-P8-NEXT: vmrghh v4, v8, v4 ; CHECK-P8-NEXT: mtvsrd v8, r4 -; CHECK-P8-NEXT: mffprwz r4, f10 +; CHECK-P8-NEXT: mffprwz r4, f4 ; CHECK-P8-NEXT: vmrghh v5, v9, v5 ; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mffprwz r4, f1 +; CHECK-P8-NEXT: mfvsrwz r4, v3 ; CHECK-P8-NEXT: vmrghh v0, v10, v0 -; CHECK-P8-NEXT: mtvsrd v10, r4 -; CHECK-P8-NEXT: mffprwz r4, f13 -; CHECK-P8-NEXT: vmrghh v1, v8, v1 -; CHECK-P8-NEXT: mtvsrd v8, r4 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, v3 -; CHECK-P8-NEXT: xxmrglw vs1, v1, v0 -; CHECK-P8-NEXT: vmrghh v6, v9, v6 -; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, v2 -; CHECK-P8-NEXT: mtvsrd v3, r4 +; CHECK-P8-NEXT: vmrghh v2, v2, v1 +; CHECK-P8-NEXT: vmrghh v3, v8, v6 +; CHECK-P8-NEXT: mtvsrd v6, r4 +; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: vmrghh v1, v9, v7 +; CHECK-P8-NEXT: mtvsrd v7, r4 +; CHECK-P8-NEXT: mffprwz r4, f1 +; CHECK-P8-NEXT: xxmrglw vs2, v1, v3 +; CHECK-P8-NEXT: xxmrglw vs1, v2, v0 +; CHECK-P8-NEXT: vmrghh v6, v6, v7 +; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: mtvsrd v2, r4 -; CHECK-P8-NEXT: vmrghh v7, v10, v7 -; CHECK-P8-NEXT: xxmrglw vs2, v7, v6 -; CHECK-P8-NEXT: vmrghh v8, v8, v9 ; CHECK-P8-NEXT: xxmrglw vs0, v5, v4 -; CHECK-P8-NEXT: vmrghh v2, v3, v2 -; CHECK-P8-NEXT: xxmrglw vs3, v2, v8 +; CHECK-P8-NEXT: mtvsrd v8, r4 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 -; CHECK-P8-NEXT: xxmrgld v3, vs3, vs2 ; CHECK-P8-NEXT: xxswapd vs1, v2 +; CHECK-P8-NEXT: vmrghh v7, v7, v8 +; CHECK-P8-NEXT: xxmrglw vs3, v7, v6 +; CHECK-P8-NEXT: xxmrgld v3, vs3, vs2 ; CHECK-P8-NEXT: xxswapd vs0, v3 ; CHECK-P8-NEXT: stxvd2x vs0, r3, r5 ; CHECK-P8-NEXT: stxvd2x vs1, 0, r3 @@ -835,92 +835,92 @@ entry: define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-LABEL: test16elt_signed: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: lxvd2x vs2, 0, r4 +; CHECK-P8-NEXT: lxvd2x vs5, 0, r4 ; CHECK-P8-NEXT: li r5, 80 ; CHECK-P8-NEXT: li r6, 32 -; CHECK-P8-NEXT: lxvd2x vs0, r4, r5 +; CHECK-P8-NEXT: lxvd2x vs3, r4, r5 ; CHECK-P8-NEXT: li r5, 16 -; CHECK-P8-NEXT: lxvd2x vs6, r4, r6 +; CHECK-P8-NEXT: lxvd2x vs9, r4, r6 ; CHECK-P8-NEXT: li r6, 64 -; CHECK-P8-NEXT: lxvd2x vs4, r4, r5 -; CHECK-P8-NEXT: lxvd2x vs8, r4, r6 +; CHECK-P8-NEXT: lxvd2x vs7, r4, r5 +; CHECK-P8-NEXT: lxvd2x vs11, r4, r6 ; CHECK-P8-NEXT: li r6, 48 -; CHECK-P8-NEXT: lxvd2x vs9, r4, r6 -; CHECK-P8-NEXT: li r6, 96 ; CHECK-P8-NEXT: lxvd2x vs12, r4, r6 +; CHECK-P8-NEXT: li r6, 96 +; CHECK-P8-NEXT: lxvd2x vs2, r4, r6 ; CHECK-P8-NEXT: li r6, 112 -; CHECK-P8-NEXT: lxvd2x v2, r4, r6 -; CHECK-P8-NEXT: xxswapd vs3, vs2 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 -; CHECK-P8-NEXT: xscvdpsxws f3, f3 -; CHECK-P8-NEXT: mffprwz r4, f2 -; CHECK-P8-NEXT: mtvsrd v4, r4 -; CHECK-P8-NEXT: xxswapd vs7, vs6 +; CHECK-P8-NEXT: lxvd2x vs0, r4, r6 +; CHECK-P8-NEXT: xxswapd vs6, vs5 +; CHECK-P8-NEXT: xscvdpsxws f5, f5 ; CHECK-P8-NEXT: xscvdpsxws f6, f6 -; CHECK-P8-NEXT: xxswapd vs1, vs0 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xscvdpsxws f7, f7 -; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxswapd vs5, vs4 +; CHECK-P8-NEXT: mffprwz r4, f5 +; CHECK-P8-NEXT: mtvsrd v4, r4 +; CHECK-P8-NEXT: xxswapd vs10, vs9 +; CHECK-P8-NEXT: xscvdpsxws f9, f9 +; CHECK-P8-NEXT: xxswapd vs4, vs3 +; CHECK-P8-NEXT: xscvdpsxws f3, f3 +; CHECK-P8-NEXT: xscvdpsxws f10, f10 ; CHECK-P8-NEXT: xscvdpsxws f4, f4 -; CHECK-P8-NEXT: mffprwz r4, f4 -; CHECK-P8-NEXT: xxswapd vs10, vs8 +; CHECK-P8-NEXT: xxswapd vs8, vs7 +; CHECK-P8-NEXT: xscvdpsxws f7, f7 +; CHECK-P8-NEXT: mffprwz r4, f7 +; CHECK-P8-NEXT: xxswapd vs13, vs11 +; CHECK-P8-NEXT: xscvdpsxws f11, f11 ; CHECK-P8-NEXT: xscvdpsxws f8, f8 -; CHECK-P8-NEXT: xscvdpsxws f5, f5 -; CHECK-P8-NEXT: xscvdpsxws f10, f10 +; CHECK-P8-NEXT: xscvdpsxws f13, f13 ; CHECK-P8-NEXT: mtvsrd v5, r4 -; CHECK-P8-NEXT: xxswapd vs11, vs9 -; CHECK-P8-NEXT: xscvdpsxws f9, f9 -; CHECK-P8-NEXT: mffprwz r4, f6 -; CHECK-P8-NEXT: xscvdpsxws f11, f11 -; CHECK-P8-NEXT: mtvsrd v0, r4 +; CHECK-P8-NEXT: xxswapd v2, vs12 +; CHECK-P8-NEXT: xscvdpsxws f12, f12 ; CHECK-P8-NEXT: mffprwz r4, f9 +; CHECK-P8-NEXT: xscvdpsxws v2, v2 +; CHECK-P8-NEXT: mtvsrd v0, r4 +; CHECK-P8-NEXT: mffprwz r4, f12 ; CHECK-P8-NEXT: mtvsrd v1, r4 -; CHECK-P8-NEXT: mffprwz r4, f8 -; CHECK-P8-NEXT: xxswapd vs13, vs12 -; CHECK-P8-NEXT: xscvdpsxws f13, f13 +; CHECK-P8-NEXT: mffprwz r4, f11 +; CHECK-P8-NEXT: xxswapd v3, vs2 +; CHECK-P8-NEXT: xscvdpsxws v3, v3 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 ; CHECK-P8-NEXT: mtvsrd v6, r4 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, f12 -; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: mffprwz r4, f3 +; CHECK-P8-NEXT: mtvsrd v7, r4 +; CHECK-P8-NEXT: mffprwz r4, f6 ; CHECK-P8-NEXT: mtvsrd v8, r4 -; CHECK-P8-NEXT: mffprwz r4, f5 -; CHECK-P8-NEXT: xxswapd v3, v2 +; CHECK-P8-NEXT: mffprwz r4, f8 +; CHECK-P8-NEXT: xxswapd vs1, vs0 +; CHECK-P8-NEXT: xscvdpsxws f1, f1 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mffprwz r4, f7 +; CHECK-P8-NEXT: mffprwz r4, f10 ; CHECK-P8-NEXT: mtvsrd v10, r4 -; CHECK-P8-NEXT: mffprwz r4, f11 +; CHECK-P8-NEXT: mfvsrwz r4, v2 +; CHECK-P8-NEXT: mtvsrd v2, r4 +; CHECK-P8-NEXT: mffprwz r4, f13 ; CHECK-P8-NEXT: vmrghh v4, v8, v4 ; CHECK-P8-NEXT: mtvsrd v8, r4 -; CHECK-P8-NEXT: mffprwz r4, f10 +; CHECK-P8-NEXT: mffprwz r4, f4 ; CHECK-P8-NEXT: vmrghh v5, v9, v5 ; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mffprwz r4, f1 +; CHECK-P8-NEXT: mfvsrwz r4, v3 ; CHECK-P8-NEXT: vmrghh v0, v10, v0 -; CHECK-P8-NEXT: mtvsrd v10, r4 -; CHECK-P8-NEXT: mffprwz r4, f13 -; CHECK-P8-NEXT: vmrghh v1, v8, v1 -; CHECK-P8-NEXT: mtvsrd v8, r4 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, v3 -; CHECK-P8-NEXT: xxmrglw vs1, v1, v0 -; CHECK-P8-NEXT: vmrghh v6, v9, v6 -; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, v2 -; CHECK-P8-NEXT: mtvsrd v3, r4 +; CHECK-P8-NEXT: vmrghh v2, v2, v1 +; CHECK-P8-NEXT: vmrghh v3, v8, v6 +; CHECK-P8-NEXT: mtvsrd v6, r4 +; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: vmrghh v1, v9, v7 +; CHECK-P8-NEXT: mtvsrd v7, r4 +; CHECK-P8-NEXT: mffprwz r4, f1 +; CHECK-P8-NEXT: xxmrglw vs2, v1, v3 +; CHECK-P8-NEXT: xxmrglw vs1, v2, v0 +; CHECK-P8-NEXT: vmrghh v6, v6, v7 +; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: mtvsrd v2, r4 -; CHECK-P8-NEXT: vmrghh v7, v10, v7 -; CHECK-P8-NEXT: xxmrglw vs2, v7, v6 -; CHECK-P8-NEXT: vmrghh v8, v8, v9 ; CHECK-P8-NEXT: xxmrglw vs0, v5, v4 -; CHECK-P8-NEXT: vmrghh v2, v3, v2 -; CHECK-P8-NEXT: xxmrglw vs3, v2, v8 +; CHECK-P8-NEXT: mtvsrd v8, r4 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 -; CHECK-P8-NEXT: xxmrgld v3, vs3, vs2 ; CHECK-P8-NEXT: xxswapd vs1, v2 +; CHECK-P8-NEXT: vmrghh v7, v7, v8 +; CHECK-P8-NEXT: xxmrglw vs3, v7, v6 +; CHECK-P8-NEXT: xxmrgld v3, vs3, vs2 ; CHECK-P8-NEXT: xxswapd vs0, v3 ; CHECK-P8-NEXT: stxvd2x vs0, r3, r5 ; CHECK-P8-NEXT: stxvd2x vs1, 0, r3 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll index dd5cb59a48bf0..770689ba98049 100644 --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll @@ -303,90 +303,90 @@ define <16 x i8> @test16elt(ptr nocapture readonly) local_unnamed_addr #2 { ; CHECK-P8-LABEL: test16elt: ; CHECK-P8: # %bb.0: # %entry ; CHECK-P8-NEXT: li r4, 80 -; CHECK-P8-NEXT: lxvd2x vs1, 0, r3 -; CHECK-P8-NEXT: lxvd2x vs0, r3, r4 -; CHECK-P8-NEXT: li r4, 48 +; CHECK-P8-NEXT: lxvd2x vs4, 0, r3 ; CHECK-P8-NEXT: lxvd2x vs3, r3, r4 +; CHECK-P8-NEXT: li r4, 48 +; CHECK-P8-NEXT: lxvd2x vs6, r3, r4 ; CHECK-P8-NEXT: li r4, 16 -; CHECK-P8-NEXT: lxvd2x vs4, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs7, r3, r4 ; CHECK-P8-NEXT: li r4, 32 -; CHECK-P8-NEXT: lxvd2x vs6, r3, r4 -; CHECK-P8-NEXT: li r4, 64 ; CHECK-P8-NEXT: lxvd2x vs9, r3, r4 -; CHECK-P8-NEXT: li r4, 96 +; CHECK-P8-NEXT: li r4, 64 ; CHECK-P8-NEXT: lxvd2x vs12, r3, r4 +; CHECK-P8-NEXT: li r4, 96 +; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 ; CHECK-P8-NEXT: li r4, 112 -; CHECK-P8-NEXT: lxvd2x v2, r3, r4 -; CHECK-P8-NEXT: xxswapd vs2, vs1 -; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: mffprwz r3, f1 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 -; CHECK-P8-NEXT: mtvsrd v4, r3 -; CHECK-P8-NEXT: xxswapd vs10, vs0 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xscvdpsxws f10, f10 -; CHECK-P8-NEXT: xxswapd vs7, vs3 -; CHECK-P8-NEXT: xscvdpsxws f3, f3 -; CHECK-P8-NEXT: xscvdpsxws f7, f7 +; CHECK-P8-NEXT: lxvd2x vs0, r3, r4 ; CHECK-P8-NEXT: xxswapd vs5, vs4 ; CHECK-P8-NEXT: xscvdpsxws f4, f4 -; CHECK-P8-NEXT: mffprwz r4, f4 +; CHECK-P8-NEXT: mffprwz r3, f4 ; CHECK-P8-NEXT: xscvdpsxws f5, f5 -; CHECK-P8-NEXT: mtvsrd v5, r4 -; CHECK-P8-NEXT: mffprwz r4, f3 -; CHECK-P8-NEXT: mtvsrd v1, r4 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xxswapd vs8, vs6 +; CHECK-P8-NEXT: mtvsrd v4, r3 +; CHECK-P8-NEXT: xxswapd vs13, vs3 +; CHECK-P8-NEXT: xscvdpsxws f3, f3 +; CHECK-P8-NEXT: xscvdpsxws f13, f13 +; CHECK-P8-NEXT: xxswapd vs10, vs6 ; CHECK-P8-NEXT: xscvdpsxws f6, f6 -; CHECK-P8-NEXT: mffprwz r3, f6 -; CHECK-P8-NEXT: mtvsrd v0, r3 +; CHECK-P8-NEXT: xscvdpsxws f10, f10 +; CHECK-P8-NEXT: xxswapd vs8, vs7 +; CHECK-P8-NEXT: xscvdpsxws f7, f7 +; CHECK-P8-NEXT: mffprwz r4, f7 ; CHECK-P8-NEXT: xscvdpsxws f8, f8 -; CHECK-P8-NEXT: xscvdpsxws f0, f12 -; CHECK-P8-NEXT: mtvsrd v7, r4 -; CHECK-P8-NEXT: mffprwz r4, f5 +; CHECK-P8-NEXT: mtvsrd v5, r4 +; CHECK-P8-NEXT: mffprwz r4, f6 +; CHECK-P8-NEXT: mtvsrd v1, r4 +; CHECK-P8-NEXT: mffprwz r4, f3 ; CHECK-P8-NEXT: xxswapd vs11, vs9 ; CHECK-P8-NEXT: xscvdpsxws f9, f9 ; CHECK-P8-NEXT: mffprwz r3, f9 -; CHECK-P8-NEXT: mtvsrd v6, r3 -; CHECK-P8-NEXT: mffprwz r3, f2 +; CHECK-P8-NEXT: mtvsrd v0, r3 ; CHECK-P8-NEXT: xscvdpsxws f11, f11 -; CHECK-P8-NEXT: mtvsrd v8, r3 -; CHECK-P8-NEXT: mffprwz r3, f8 -; CHECK-P8-NEXT: xxswapd vs13, vs12 -; CHECK-P8-NEXT: xscvdpsxws f13, f13 +; CHECK-P8-NEXT: mtvsrd v7, r4 +; CHECK-P8-NEXT: mffprwz r4, f8 ; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mffprwz r4, f7 -; CHECK-P8-NEXT: xxswapd v3, v2 -; CHECK-P8-NEXT: vmrghb v4, v8, v4 +; CHECK-P8-NEXT: xxswapd v2, vs12 +; CHECK-P8-NEXT: xscvdpsxws f12, f12 +; CHECK-P8-NEXT: mffprwz r3, f12 +; CHECK-P8-NEXT: mtvsrd v6, r3 +; CHECK-P8-NEXT: mffprwz r3, f5 +; CHECK-P8-NEXT: xscvdpsxws v2, v2 ; CHECK-P8-NEXT: mtvsrd v8, r3 ; CHECK-P8-NEXT: mffprwz r3, f11 -; CHECK-P8-NEXT: vmrghb v5, v9, v5 -; CHECK-P8-NEXT: mtvsrd v9, r4 +; CHECK-P8-NEXT: xxswapd v3, vs2 +; CHECK-P8-NEXT: xscvdpsxws v3, v3 ; CHECK-P8-NEXT: mffprwz r4, f10 -; CHECK-P8-NEXT: vmrghb v0, v8, v0 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 +; CHECK-P8-NEXT: xxswapd vs1, vs0 +; CHECK-P8-NEXT: xscvdpsxws f1, f1 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: vmrghb v4, v8, v4 ; CHECK-P8-NEXT: mtvsrd v8, r3 -; CHECK-P8-NEXT: mffprwz r3, f13 -; CHECK-P8-NEXT: vmrghb v1, v9, v1 -; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: vmrghb v6, v8, v6 +; CHECK-P8-NEXT: mfvsrwz r3, v2 +; CHECK-P8-NEXT: mtvsrd v2, r4 +; CHECK-P8-NEXT: mffprwz r4, f13 +; CHECK-P8-NEXT: vmrghb v5, v9, v5 +; CHECK-P8-NEXT: vmrghb v0, v8, v0 ; CHECK-P8-NEXT: mtvsrd v8, r3 +; CHECK-P8-NEXT: mfvsrwz r3, v3 +; CHECK-P8-NEXT: vmrglh v4, v5, v4 +; CHECK-P8-NEXT: mtvsrd v3, r4 +; CHECK-P8-NEXT: vmrghb v2, v2, v1 +; CHECK-P8-NEXT: vmrghb v1, v8, v6 +; CHECK-P8-NEXT: mtvsrd v6, r3 +; CHECK-P8-NEXT: mffprwz r3, f2 +; CHECK-P8-NEXT: vmrglh v2, v2, v0 +; CHECK-P8-NEXT: vmrghb v3, v3, v7 +; CHECK-P8-NEXT: mtvsrd v7, r3 +; CHECK-P8-NEXT: mffprwz r3, f1 +; CHECK-P8-NEXT: vmrglh v3, v3, v1 +; CHECK-P8-NEXT: vmrghb v6, v6, v7 +; CHECK-P8-NEXT: mtvsrd v7, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, v3 -; CHECK-P8-NEXT: vmrghb v7, v9, v7 -; CHECK-P8-NEXT: mtvsrd v9, r3 -; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, v2 -; CHECK-P8-NEXT: mtvsrd v3, r3 -; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: mtvsrd v2, r3 -; CHECK-P8-NEXT: vmrghb v8, v8, v9 -; CHECK-P8-NEXT: vmrghb v2, v3, v2 -; CHECK-P8-NEXT: vmrglh v3, v5, v4 -; CHECK-P8-NEXT: vmrglh v4, v1, v0 +; CHECK-P8-NEXT: xxmrglw vs0, v2, v4 +; CHECK-P8-NEXT: mtvsrd v8, r3 +; CHECK-P8-NEXT: vmrghb v7, v7, v8 ; CHECK-P8-NEXT: vmrglh v5, v7, v6 -; CHECK-P8-NEXT: vmrglh v2, v2, v8 -; CHECK-P8-NEXT: xxmrglw vs0, v4, v3 -; CHECK-P8-NEXT: xxmrglw vs1, v2, v5 +; CHECK-P8-NEXT: xxmrglw vs1, v5, v3 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: blr ; @@ -858,90 +858,90 @@ define <16 x i8> @test16elt_signed(ptr nocapture readonly) local_unnamed_addr #2 ; CHECK-P8-LABEL: test16elt_signed: ; CHECK-P8: # %bb.0: # %entry ; CHECK-P8-NEXT: li r4, 80 -; CHECK-P8-NEXT: lxvd2x vs1, 0, r3 -; CHECK-P8-NEXT: lxvd2x vs0, r3, r4 -; CHECK-P8-NEXT: li r4, 48 +; CHECK-P8-NEXT: lxvd2x vs4, 0, r3 ; CHECK-P8-NEXT: lxvd2x vs3, r3, r4 +; CHECK-P8-NEXT: li r4, 48 +; CHECK-P8-NEXT: lxvd2x vs6, r3, r4 ; CHECK-P8-NEXT: li r4, 16 -; CHECK-P8-NEXT: lxvd2x vs4, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs7, r3, r4 ; CHECK-P8-NEXT: li r4, 32 -; CHECK-P8-NEXT: lxvd2x vs6, r3, r4 -; CHECK-P8-NEXT: li r4, 64 ; CHECK-P8-NEXT: lxvd2x vs9, r3, r4 -; CHECK-P8-NEXT: li r4, 96 +; CHECK-P8-NEXT: li r4, 64 ; CHECK-P8-NEXT: lxvd2x vs12, r3, r4 +; CHECK-P8-NEXT: li r4, 96 +; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 ; CHECK-P8-NEXT: li r4, 112 -; CHECK-P8-NEXT: lxvd2x v2, r3, r4 -; CHECK-P8-NEXT: xxswapd vs2, vs1 -; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: mffprwz r3, f1 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 -; CHECK-P8-NEXT: mtvsrd v4, r3 -; CHECK-P8-NEXT: xxswapd vs10, vs0 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xscvdpsxws f10, f10 -; CHECK-P8-NEXT: xxswapd vs7, vs3 -; CHECK-P8-NEXT: xscvdpsxws f3, f3 -; CHECK-P8-NEXT: xscvdpsxws f7, f7 +; CHECK-P8-NEXT: lxvd2x vs0, r3, r4 ; CHECK-P8-NEXT: xxswapd vs5, vs4 ; CHECK-P8-NEXT: xscvdpsxws f4, f4 -; CHECK-P8-NEXT: mffprwz r4, f4 +; CHECK-P8-NEXT: mffprwz r3, f4 ; CHECK-P8-NEXT: xscvdpsxws f5, f5 -; CHECK-P8-NEXT: mtvsrd v5, r4 -; CHECK-P8-NEXT: mffprwz r4, f3 -; CHECK-P8-NEXT: mtvsrd v1, r4 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xxswapd vs8, vs6 +; CHECK-P8-NEXT: mtvsrd v4, r3 +; CHECK-P8-NEXT: xxswapd vs13, vs3 +; CHECK-P8-NEXT: xscvdpsxws f3, f3 +; CHECK-P8-NEXT: xscvdpsxws f13, f13 +; CHECK-P8-NEXT: xxswapd vs10, vs6 ; CHECK-P8-NEXT: xscvdpsxws f6, f6 -; CHECK-P8-NEXT: mffprwz r3, f6 -; CHECK-P8-NEXT: mtvsrd v0, r3 +; CHECK-P8-NEXT: xscvdpsxws f10, f10 +; CHECK-P8-NEXT: xxswapd vs8, vs7 +; CHECK-P8-NEXT: xscvdpsxws f7, f7 +; CHECK-P8-NEXT: mffprwz r4, f7 ; CHECK-P8-NEXT: xscvdpsxws f8, f8 -; CHECK-P8-NEXT: xscvdpsxws f0, f12 -; CHECK-P8-NEXT: mtvsrd v7, r4 -; CHECK-P8-NEXT: mffprwz r4, f5 +; CHECK-P8-NEXT: mtvsrd v5, r4 +; CHECK-P8-NEXT: mffprwz r4, f6 +; CHECK-P8-NEXT: mtvsrd v1, r4 +; CHECK-P8-NEXT: mffprwz r4, f3 ; CHECK-P8-NEXT: xxswapd vs11, vs9 ; CHECK-P8-NEXT: xscvdpsxws f9, f9 ; CHECK-P8-NEXT: mffprwz r3, f9 -; CHECK-P8-NEXT: mtvsrd v6, r3 -; CHECK-P8-NEXT: mffprwz r3, f2 +; CHECK-P8-NEXT: mtvsrd v0, r3 ; CHECK-P8-NEXT: xscvdpsxws f11, f11 -; CHECK-P8-NEXT: mtvsrd v8, r3 -; CHECK-P8-NEXT: mffprwz r3, f8 -; CHECK-P8-NEXT: xxswapd vs13, vs12 -; CHECK-P8-NEXT: xscvdpsxws f13, f13 +; CHECK-P8-NEXT: mtvsrd v7, r4 +; CHECK-P8-NEXT: mffprwz r4, f8 ; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mffprwz r4, f7 -; CHECK-P8-NEXT: xxswapd v3, v2 -; CHECK-P8-NEXT: vmrghb v4, v8, v4 +; CHECK-P8-NEXT: xxswapd v2, vs12 +; CHECK-P8-NEXT: xscvdpsxws f12, f12 +; CHECK-P8-NEXT: mffprwz r3, f12 +; CHECK-P8-NEXT: mtvsrd v6, r3 +; CHECK-P8-NEXT: mffprwz r3, f5 +; CHECK-P8-NEXT: xscvdpsxws v2, v2 ; CHECK-P8-NEXT: mtvsrd v8, r3 ; CHECK-P8-NEXT: mffprwz r3, f11 -; CHECK-P8-NEXT: vmrghb v5, v9, v5 -; CHECK-P8-NEXT: mtvsrd v9, r4 +; CHECK-P8-NEXT: xxswapd v3, vs2 +; CHECK-P8-NEXT: xscvdpsxws v3, v3 ; CHECK-P8-NEXT: mffprwz r4, f10 -; CHECK-P8-NEXT: vmrghb v0, v8, v0 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 +; CHECK-P8-NEXT: xxswapd vs1, vs0 +; CHECK-P8-NEXT: xscvdpsxws f1, f1 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: vmrghb v4, v8, v4 ; CHECK-P8-NEXT: mtvsrd v8, r3 -; CHECK-P8-NEXT: mffprwz r3, f13 -; CHECK-P8-NEXT: vmrghb v1, v9, v1 -; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: vmrghb v6, v8, v6 +; CHECK-P8-NEXT: mfvsrwz r3, v2 +; CHECK-P8-NEXT: mtvsrd v2, r4 +; CHECK-P8-NEXT: mffprwz r4, f13 +; CHECK-P8-NEXT: vmrghb v5, v9, v5 +; CHECK-P8-NEXT: vmrghb v0, v8, v0 ; CHECK-P8-NEXT: mtvsrd v8, r3 +; CHECK-P8-NEXT: mfvsrwz r3, v3 +; CHECK-P8-NEXT: vmrglh v4, v5, v4 +; CHECK-P8-NEXT: mtvsrd v3, r4 +; CHECK-P8-NEXT: vmrghb v2, v2, v1 +; CHECK-P8-NEXT: vmrghb v1, v8, v6 +; CHECK-P8-NEXT: mtvsrd v6, r3 +; CHECK-P8-NEXT: mffprwz r3, f2 +; CHECK-P8-NEXT: vmrglh v2, v2, v0 +; CHECK-P8-NEXT: vmrghb v3, v3, v7 +; CHECK-P8-NEXT: mtvsrd v7, r3 +; CHECK-P8-NEXT: mffprwz r3, f1 +; CHECK-P8-NEXT: vmrglh v3, v3, v1 +; CHECK-P8-NEXT: vmrghb v6, v6, v7 +; CHECK-P8-NEXT: mtvsrd v7, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, v3 -; CHECK-P8-NEXT: vmrghb v7, v9, v7 -; CHECK-P8-NEXT: mtvsrd v9, r3 -; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, v2 -; CHECK-P8-NEXT: mtvsrd v3, r3 -; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: mtvsrd v2, r3 -; CHECK-P8-NEXT: vmrghb v8, v8, v9 -; CHECK-P8-NEXT: vmrghb v2, v3, v2 -; CHECK-P8-NEXT: vmrglh v3, v5, v4 -; CHECK-P8-NEXT: vmrglh v4, v1, v0 +; CHECK-P8-NEXT: xxmrglw vs0, v2, v4 +; CHECK-P8-NEXT: mtvsrd v8, r3 +; CHECK-P8-NEXT: vmrghb v7, v7, v8 ; CHECK-P8-NEXT: vmrglh v5, v7, v6 -; CHECK-P8-NEXT: vmrglh v2, v2, v8 -; CHECK-P8-NEXT: xxmrglw vs0, v4, v3 -; CHECK-P8-NEXT: xxmrglw vs1, v2, v5 +; CHECK-P8-NEXT: xxmrglw vs1, v5, v3 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: blr ; diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll index 9ebc0d2c171ba..4d75a74f06ac2 100644 --- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll @@ -129,16 +129,16 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) { ; RV64IA-NEXT: andi a2, a0, -4 ; RV64IA-NEXT: slli a0, a0, 3 ; RV64IA-NEXT: li a3, 255 -; RV64IA-NEXT: sllw a3, a3, a0 -; RV64IA-NEXT: lw a4, 0(a2) +; RV64IA-NEXT: sllw a4, a3, a0 +; RV64IA-NEXT: lw a3, 0(a2) ; RV64IA-NEXT: andi a0, a0, 24 -; RV64IA-NEXT: not a3, a3 +; RV64IA-NEXT: not a4, a4 ; RV64IA-NEXT: andi a1, a1, 255 ; RV64IA-NEXT: .LBB0_1: # %atomicrmw.start ; RV64IA-NEXT: # =>This Loop Header: Depth=1 ; RV64IA-NEXT: # Child Loop BB0_3 Depth 2 -; RV64IA-NEXT: srlw a5, a4, a0 -; RV64IA-NEXT: sext.w a6, a4 +; RV64IA-NEXT: srlw a5, a3, a0 +; RV64IA-NEXT: sext.w a6, a3 ; RV64IA-NEXT: andi a7, a5, 255 ; RV64IA-NEXT: addiw a5, a5, 1 ; RV64IA-NEXT: sltu a7, a7, a1 @@ -146,20 +146,20 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) { ; RV64IA-NEXT: and a5, a7, a5 ; RV64IA-NEXT: andi a5, a5, 255 ; RV64IA-NEXT: sllw a5, a5, a0 -; RV64IA-NEXT: and a4, a4, a3 -; RV64IA-NEXT: or a5, a4, a5 +; RV64IA-NEXT: and a3, a3, a4 +; RV64IA-NEXT: or a5, a3, a5 ; RV64IA-NEXT: .LBB0_3: # %atomicrmw.start ; RV64IA-NEXT: # Parent Loop BB0_1 Depth=1 ; RV64IA-NEXT: # => This Inner Loop Header: Depth=2 -; RV64IA-NEXT: lr.w.aqrl a4, (a2) -; RV64IA-NEXT: bne a4, a6, .LBB0_1 +; RV64IA-NEXT: lr.w.aqrl a3, (a2) +; RV64IA-NEXT: bne a3, a6, .LBB0_1 ; RV64IA-NEXT: # %bb.4: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB0_3 Depth=2 ; RV64IA-NEXT: sc.w.rl a7, a5, (a2) ; RV64IA-NEXT: bnez a7, .LBB0_3 ; RV64IA-NEXT: # %bb.5: # %atomicrmw.start ; RV64IA-NEXT: # %bb.2: # %atomicrmw.end -; RV64IA-NEXT: srlw a0, a4, a0 +; RV64IA-NEXT: srlw a0, a3, a0 ; RV64IA-NEXT: ret %result = atomicrmw uinc_wrap ptr %ptr, i8 %val seq_cst ret i8 %result @@ -290,19 +290,19 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { ; RV64IA-LABEL: atomicrmw_uinc_wrap_i16: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a4, a0, 3 -; RV64IA-NEXT: andi a0, a4, 24 +; RV64IA-NEXT: slli a5, a0, 3 +; RV64IA-NEXT: andi a0, a5, 24 ; RV64IA-NEXT: lui a3, 16 ; RV64IA-NEXT: addiw a3, a3, -1 -; RV64IA-NEXT: lw a5, 0(a2) -; RV64IA-NEXT: sllw a4, a3, a4 -; RV64IA-NEXT: not a4, a4 +; RV64IA-NEXT: lw a4, 0(a2) +; RV64IA-NEXT: sllw a5, a3, a5 +; RV64IA-NEXT: not a5, a5 ; RV64IA-NEXT: and a1, a1, a3 ; RV64IA-NEXT: .LBB1_1: # %atomicrmw.start ; RV64IA-NEXT: # =>This Loop Header: Depth=1 ; RV64IA-NEXT: # Child Loop BB1_3 Depth 2 -; RV64IA-NEXT: srlw a6, a5, a0 -; RV64IA-NEXT: sext.w a7, a5 +; RV64IA-NEXT: srlw a6, a4, a0 +; RV64IA-NEXT: sext.w a7, a4 ; RV64IA-NEXT: and t0, a6, a3 ; RV64IA-NEXT: addiw a6, a6, 1 ; RV64IA-NEXT: sltu t0, t0, a1 @@ -310,20 +310,20 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { ; RV64IA-NEXT: and a6, a6, a3 ; RV64IA-NEXT: and a6, t0, a6 ; RV64IA-NEXT: sllw a6, a6, a0 -; RV64IA-NEXT: and a5, a5, a4 -; RV64IA-NEXT: or a6, a5, a6 +; RV64IA-NEXT: and a4, a4, a5 +; RV64IA-NEXT: or a6, a4, a6 ; RV64IA-NEXT: .LBB1_3: # %atomicrmw.start ; RV64IA-NEXT: # Parent Loop BB1_1 Depth=1 ; RV64IA-NEXT: # => This Inner Loop Header: Depth=2 -; RV64IA-NEXT: lr.w.aqrl a5, (a2) -; RV64IA-NEXT: bne a5, a7, .LBB1_1 +; RV64IA-NEXT: lr.w.aqrl a4, (a2) +; RV64IA-NEXT: bne a4, a7, .LBB1_1 ; RV64IA-NEXT: # %bb.4: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB1_3 Depth=2 ; RV64IA-NEXT: sc.w.rl t0, a6, (a2) ; RV64IA-NEXT: bnez t0, .LBB1_3 ; RV64IA-NEXT: # %bb.5: # %atomicrmw.start ; RV64IA-NEXT: # %bb.2: # %atomicrmw.end -; RV64IA-NEXT: srlw a0, a5, a0 +; RV64IA-NEXT: srlw a0, a4, a0 ; RV64IA-NEXT: ret %result = atomicrmw uinc_wrap ptr %ptr, i16 %val seq_cst ret i16 %result @@ -776,37 +776,37 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) { ; RV64IA-LABEL: atomicrmw_udec_wrap_i8: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a3, a0, 3 -; RV64IA-NEXT: andi a0, a3, 24 +; RV64IA-NEXT: slli a4, a0, 3 +; RV64IA-NEXT: andi a0, a4, 24 ; RV64IA-NEXT: li a5, 255 -; RV64IA-NEXT: lw a4, 0(a2) -; RV64IA-NEXT: sllw a3, a5, a3 -; RV64IA-NEXT: not a3, a3 +; RV64IA-NEXT: lw a3, 0(a2) +; RV64IA-NEXT: sllw a4, a5, a4 +; RV64IA-NEXT: not a4, a4 ; RV64IA-NEXT: andi a5, a1, 255 ; RV64IA-NEXT: j .LBB4_2 ; RV64IA-NEXT: .LBB4_1: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB4_2 Depth=1 -; RV64IA-NEXT: sext.w a6, a4 +; RV64IA-NEXT: sext.w a6, a3 ; RV64IA-NEXT: andi a7, a7, 255 ; RV64IA-NEXT: sllw a7, a7, a0 -; RV64IA-NEXT: and a4, a4, a3 -; RV64IA-NEXT: or a7, a4, a7 +; RV64IA-NEXT: and a3, a3, a4 +; RV64IA-NEXT: or a7, a3, a7 ; RV64IA-NEXT: .LBB4_5: # %atomicrmw.start ; RV64IA-NEXT: # Parent Loop BB4_2 Depth=1 ; RV64IA-NEXT: # => This Inner Loop Header: Depth=2 -; RV64IA-NEXT: lr.w.aqrl a4, (a2) -; RV64IA-NEXT: bne a4, a6, .LBB4_7 +; RV64IA-NEXT: lr.w.aqrl a3, (a2) +; RV64IA-NEXT: bne a3, a6, .LBB4_7 ; RV64IA-NEXT: # %bb.6: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB4_5 Depth=2 ; RV64IA-NEXT: sc.w.rl t0, a7, (a2) ; RV64IA-NEXT: bnez t0, .LBB4_5 ; RV64IA-NEXT: .LBB4_7: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB4_2 Depth=1 -; RV64IA-NEXT: beq a4, a6, .LBB4_4 +; RV64IA-NEXT: beq a3, a6, .LBB4_4 ; RV64IA-NEXT: .LBB4_2: # %atomicrmw.start ; RV64IA-NEXT: # =>This Loop Header: Depth=1 ; RV64IA-NEXT: # Child Loop BB4_5 Depth 2 -; RV64IA-NEXT: srlw a6, a4, a0 +; RV64IA-NEXT: srlw a6, a3, a0 ; RV64IA-NEXT: andi a7, a6, 255 ; RV64IA-NEXT: seqz t0, a7 ; RV64IA-NEXT: sltu a7, a5, a7 @@ -818,7 +818,7 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) { ; RV64IA-NEXT: addi a7, a6, -1 ; RV64IA-NEXT: j .LBB4_1 ; RV64IA-NEXT: .LBB4_4: # %atomicrmw.end -; RV64IA-NEXT: srlw a0, a4, a0 +; RV64IA-NEXT: srlw a0, a3, a0 ; RV64IA-NEXT: ret %result = atomicrmw udec_wrap ptr %ptr, i8 %val seq_cst ret i8 %result @@ -983,38 +983,38 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { ; RV64IA-LABEL: atomicrmw_udec_wrap_i16: ; RV64IA: # %bb.0: ; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a4, a0, 3 -; RV64IA-NEXT: andi a0, a4, 24 +; RV64IA-NEXT: slli a5, a0, 3 +; RV64IA-NEXT: andi a0, a5, 24 ; RV64IA-NEXT: lui a3, 16 ; RV64IA-NEXT: addiw a3, a3, -1 -; RV64IA-NEXT: lw a5, 0(a2) -; RV64IA-NEXT: sllw a4, a3, a4 -; RV64IA-NEXT: not a4, a4 +; RV64IA-NEXT: lw a4, 0(a2) +; RV64IA-NEXT: sllw a5, a3, a5 +; RV64IA-NEXT: not a5, a5 ; RV64IA-NEXT: and a6, a1, a3 ; RV64IA-NEXT: j .LBB5_2 ; RV64IA-NEXT: .LBB5_1: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB5_2 Depth=1 -; RV64IA-NEXT: sext.w a7, a5 +; RV64IA-NEXT: sext.w a7, a4 ; RV64IA-NEXT: and t0, t0, a3 ; RV64IA-NEXT: sllw t0, t0, a0 -; RV64IA-NEXT: and a5, a5, a4 -; RV64IA-NEXT: or t0, a5, t0 +; RV64IA-NEXT: and a4, a4, a5 +; RV64IA-NEXT: or t0, a4, t0 ; RV64IA-NEXT: .LBB5_5: # %atomicrmw.start ; RV64IA-NEXT: # Parent Loop BB5_2 Depth=1 ; RV64IA-NEXT: # => This Inner Loop Header: Depth=2 -; RV64IA-NEXT: lr.w.aqrl a5, (a2) -; RV64IA-NEXT: bne a5, a7, .LBB5_7 +; RV64IA-NEXT: lr.w.aqrl a4, (a2) +; RV64IA-NEXT: bne a4, a7, .LBB5_7 ; RV64IA-NEXT: # %bb.6: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB5_5 Depth=2 ; RV64IA-NEXT: sc.w.rl t1, t0, (a2) ; RV64IA-NEXT: bnez t1, .LBB5_5 ; RV64IA-NEXT: .LBB5_7: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB5_2 Depth=1 -; RV64IA-NEXT: beq a5, a7, .LBB5_4 +; RV64IA-NEXT: beq a4, a7, .LBB5_4 ; RV64IA-NEXT: .LBB5_2: # %atomicrmw.start ; RV64IA-NEXT: # =>This Loop Header: Depth=1 ; RV64IA-NEXT: # Child Loop BB5_5 Depth 2 -; RV64IA-NEXT: srlw a7, a5, a0 +; RV64IA-NEXT: srlw a7, a4, a0 ; RV64IA-NEXT: and t0, a7, a3 ; RV64IA-NEXT: seqz t1, t0 ; RV64IA-NEXT: sltu t0, a6, t0 @@ -1026,7 +1026,7 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { ; RV64IA-NEXT: addi t0, a7, -1 ; RV64IA-NEXT: j .LBB5_1 ; RV64IA-NEXT: .LBB5_4: # %atomicrmw.end -; RV64IA-NEXT: srlw a0, a5, a0 +; RV64IA-NEXT: srlw a0, a4, a0 ; RV64IA-NEXT: ret %result = atomicrmw udec_wrap ptr %ptr, i16 %val seq_cst ret i16 %result diff --git a/llvm/test/CodeGen/RISCV/branch-relaxation.ll b/llvm/test/CodeGen/RISCV/branch-relaxation.ll index cbe12187a4110..4f7736e318cae 100644 --- a/llvm/test/CodeGen/RISCV/branch-relaxation.ll +++ b/llvm/test/CodeGen/RISCV/branch-relaxation.ll @@ -1204,28 +1204,28 @@ define void @relax_jal_spill_64() { ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t0, 5 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw t0, 212(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw t1, 216(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t0, 216(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t1, 212(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t1, 6 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw t1, 204(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw t2, 208(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t1, 208(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t2, 204(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t2, 7 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw t2, 196(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw t3, 200(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t2, 200(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t3, 196(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s0, 8 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s0, 188(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s1, 192(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s0, 192(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s1, 188(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s1, 9 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s1, 180(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s2, 184(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s1, 184(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s2, 180(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a0, 10 ; CHECK-RV32-NEXT: #NO_APP @@ -1233,83 +1233,83 @@ define void @relax_jal_spill_64() { ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a1, 11 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw a1, 168(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw a2, 172(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a1, 172(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a2, 168(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a2, 12 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw a2, 160(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw a3, 164(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a2, 164(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a3, 160(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a3, 13 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw a3, 152(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw a4, 156(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a3, 156(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a4, 152(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a4, 14 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw a4, 144(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw a5, 148(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a4, 148(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a5, 144(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a5, 15 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw a5, 136(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw a6, 140(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a5, 140(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a6, 136(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a6, 16 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw a6, 128(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw a7, 132(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a6, 132(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a7, 128(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a7, 17 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw a7, 120(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw t0, 124(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a7, 124(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t0, 120(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s2, 18 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s3, 116(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s2, 116(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s3, 112(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s3, 19 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s3, 104(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s4, 108(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s4, 104(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s4, 20 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s4, 96(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s4, 100(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s5, 96(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s5, 21 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s5, 88(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s6, 92(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s5, 92(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s6, 88(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s6, 22 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s6, 80(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s7, 84(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s7, 80(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s7, 23 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s7, 72(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s7, 76(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s8, 72(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s8, 24 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s8, 64(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s9, 68(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s8, 68(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s9, 64(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s9, 25 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s9, 56(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s10, 60(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s9, 60(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s10, 56(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s10, 26 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s10, 48(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s11, 52(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s10, 52(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s11, 48(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s11, 27 ; CHECK-RV32-NEXT: #NO_APP @@ -1317,13 +1317,13 @@ define void @relax_jal_spill_64() { ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t3, 28 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw t3, 36(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw t4, 40(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t3, 40(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t4, 36(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t4, 29 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw t4, 28(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw t5, 32(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t4, 32(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t5, 28(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t5, 30 ; CHECK-RV32-NEXT: #NO_APP @@ -1331,13 +1331,12 @@ define void @relax_jal_spill_64() { ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t6, 31 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: mv a2, t6 -; CHECK-RV32-NEXT: mv t6, a1 -; CHECK-RV32-NEXT: sw s0, 20(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a1, 24(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: xor a1, a1, s0 -; CHECK-RV32-NEXT: sw t5, 24(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw a2, 16(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: xor a2, t5, a2 +; CHECK-RV32-NEXT: sw t6, 20(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t5, 16(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: xor a2, t5, t6 ; CHECK-RV32-NEXT: or a1, a2, a1 ; CHECK-RV32-NEXT: beqz a1, .LBB4_1 ; CHECK-RV32-NEXT: # %bb.3: @@ -1350,28 +1349,28 @@ define void @relax_jal_spill_64() { ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use ra ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw t0, 212(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw t1, 216(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t0, 216(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t1, 212(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t0 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw t1, 204(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw t2, 208(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t1, 208(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t2, 204(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t1 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw t2, 196(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw t3, 200(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t2, 200(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t3, 196(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t2 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s0, 188(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s1, 192(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s0, 192(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s1, 188(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s0 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s1, 180(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s2, 184(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s1, 184(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s2, 180(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s1 ; CHECK-RV32-NEXT: #NO_APP @@ -1379,83 +1378,83 @@ define void @relax_jal_spill_64() { ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a0 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw a1, 168(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw a2, 172(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a1, 172(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a2, 168(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a1 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw a2, 160(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw a3, 164(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a2, 164(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a3, 160(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a2 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw a3, 152(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw a4, 156(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a3, 156(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a4, 152(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a3 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw a4, 144(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw a5, 148(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a4, 148(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a5, 144(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a4 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw a5, 136(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw a6, 140(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a5, 140(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a6, 136(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a5 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw a6, 128(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw a7, 132(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a6, 132(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a7, 128(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a6 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw a7, 120(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw t0, 124(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a7, 124(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t0, 120(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a7 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s3, 116(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s2, 116(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s3, 112(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s2 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s3, 104(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s4, 108(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s4, 104(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s3 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s4, 96(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s4, 100(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s5, 96(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s4 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s5, 88(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s6, 92(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s5, 92(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s6, 88(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s5 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s6, 80(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s7, 84(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s7, 80(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s6 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s7, 72(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s7, 76(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s8, 72(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s7 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s8, 64(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s9, 68(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s8, 68(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s9, 64(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s8 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s9, 56(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s10, 60(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s9, 60(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s10, 56(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s9 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s10, 48(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s11, 52(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s10, 52(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s11, 48(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s10 ; CHECK-RV32-NEXT: #NO_APP @@ -1463,22 +1462,23 @@ define void @relax_jal_spill_64() { ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s11 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw t3, 36(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw t4, 40(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t3, 40(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t4, 36(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t3 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw t4, 28(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw t5, 32(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t4, 32(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t5, 28(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t4 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw t5, 24(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t5, 16(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t6, 24(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t5 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw t6, 16(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s0, 20(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t6, 20(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t6 ; CHECK-RV32-NEXT: #NO_APP @@ -1839,46 +1839,46 @@ define void @relax_jal_spill_64_adjust_spill_slot() { ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw t0, -8(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t0, -4(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw t1, -4(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t1, -8(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t1, 6 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw t1, -16(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t1, -12(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw t2, -12(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t2, -16(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t2, 7 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw t2, -24(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t2, -20(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw t3, -20(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t3, -24(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s0, 8 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw s0, -32(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s0, -28(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw s1, -28(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s1, -32(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s1, 9 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw s1, -40(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s1, -36(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw s2, -36(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s2, -40(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a0, 10 ; CHECK-RV32-NEXT: #NO_APP @@ -1890,145 +1890,145 @@ define void @relax_jal_spill_64_adjust_spill_slot() { ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a3, 1 ; CHECK-RV32-NEXT: add a3, sp, a3 -; CHECK-RV32-NEXT: sw a1, -52(a3) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a1, -48(a3) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a2, -48(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a2, -52(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a2, 12 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a2, -60(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a2, -56(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a3, -56(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a3, -60(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a3, 13 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a3, -68(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a3, -64(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a4, -64(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a4, -68(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a4, 14 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a4, -76(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a4, -72(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a5, -72(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a5, -76(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a5, 15 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a5, -84(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a5, -80(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a6, -80(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a6, -84(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a6, 16 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a6, -92(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a6, -88(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a7, -88(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a7, -92(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a7, 17 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a7, -100(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a7, -96(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw t0, -96(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t0, -100(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s2, 18 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s2, -108(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s2, -104(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s3, -104(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s3, -108(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s3, 19 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s3, -116(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s3, -112(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s4, -112(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s4, -116(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s4, 20 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s4, -124(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s4, -120(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s5, -120(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s5, -124(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s5, 21 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s5, -132(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s5, -128(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s6, -128(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s6, -132(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s6, 22 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s6, -140(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s6, -136(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s7, -136(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s7, -140(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s7, 23 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s7, -148(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s7, -144(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s8, -144(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s8, -148(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s8, 24 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s8, -156(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s8, -152(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s9, -152(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s9, -156(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s9, 25 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s9, -164(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s9, -160(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s10, -160(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s10, -164(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s10, 26 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s10, -172(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s10, -168(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s11, -168(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s11, -172(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s11, 27 ; CHECK-RV32-NEXT: #NO_APP @@ -2040,19 +2040,19 @@ define void @relax_jal_spill_64_adjust_spill_slot() { ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw t3, -184(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t3, -180(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw t4, -180(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t4, -184(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t4, 29 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw t4, -192(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t4, -188(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw t5, -188(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t5, -192(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t5, 30 ; CHECK-RV32-NEXT: #NO_APP @@ -2060,19 +2060,20 @@ define void @relax_jal_spill_64_adjust_spill_slot() { ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t6, 31 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: mv a2, t6 -; CHECK-RV32-NEXT: mv t6, a1 -; CHECK-RV32-NEXT: lui a3, 1 -; CHECK-RV32-NEXT: add a3, sp, a3 -; CHECK-RV32-NEXT: sw s0, -204(a3) # 4-byte Folded Spill +; CHECK-RV32-NEXT: lui a2, 1 +; CHECK-RV32-NEXT: add a2, sp, a2 +; CHECK-RV32-NEXT: sw s0, -208(a2) # 4-byte Folded Spill +; CHECK-RV32-NEXT: lui a2, 1 +; CHECK-RV32-NEXT: add a2, sp, a2 +; CHECK-RV32-NEXT: sw a1, -196(a2) # 4-byte Folded Spill ; CHECK-RV32-NEXT: xor a1, a1, s0 -; CHECK-RV32-NEXT: lui a3, 1 -; CHECK-RV32-NEXT: add a3, sp, a3 -; CHECK-RV32-NEXT: sw t5, -196(a3) # 4-byte Folded Spill -; CHECK-RV32-NEXT: lui a3, 1 -; CHECK-RV32-NEXT: add a3, sp, a3 -; CHECK-RV32-NEXT: sw a2, -200(a3) # 4-byte Folded Spill -; CHECK-RV32-NEXT: xor a2, t5, a2 +; CHECK-RV32-NEXT: lui a2, 1 +; CHECK-RV32-NEXT: add a2, sp, a2 +; CHECK-RV32-NEXT: sw t6, -200(a2) # 4-byte Folded Spill +; CHECK-RV32-NEXT: lui a2, 1 +; CHECK-RV32-NEXT: add a2, sp, a2 +; CHECK-RV32-NEXT: sw t5, -204(a2) # 4-byte Folded Spill +; CHECK-RV32-NEXT: xor a2, t5, t6 ; CHECK-RV32-NEXT: or a1, a2, a1 ; CHECK-RV32-NEXT: beqz a1, .LBB5_1 ; CHECK-RV32-NEXT: # %bb.3: @@ -2087,46 +2088,46 @@ define void @relax_jal_spill_64_adjust_spill_slot() { ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw t0, -8(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t0, -4(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw t1, -4(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t1, -8(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t0 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw t1, -16(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t1, -12(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw t2, -12(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t2, -16(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t1 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw t2, -24(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t2, -20(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw t3, -20(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t3, -24(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t2 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw s0, -32(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s0, -28(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw s1, -28(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s1, -32(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s0 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw s1, -40(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s1, -36(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw s2, -36(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s2, -40(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s1 ; CHECK-RV32-NEXT: #NO_APP @@ -2138,145 +2139,145 @@ define void @relax_jal_spill_64_adjust_spill_slot() { ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a1, -52(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a1, -48(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a2, -48(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a2, -52(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a1 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a2, -60(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a2, -56(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a3, -56(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a3, -60(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a2 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a3, -68(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a3, -64(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a4, -64(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a4, -68(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a3 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a4, -76(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a4, -72(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a5, -72(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a5, -76(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a4 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a5, -84(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a5, -80(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a6, -80(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a6, -84(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a5 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a6, -92(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a6, -88(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a7, -88(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a7, -92(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a6 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a7, -100(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a7, -96(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t0, -96(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t0, -100(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a7 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s2, -108(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s2, -104(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s3, -104(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s3, -108(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s2 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s3, -116(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s3, -112(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s4, -112(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s4, -116(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s3 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s4, -124(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s4, -120(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s5, -120(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s5, -124(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s4 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s5, -132(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s5, -128(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s6, -128(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s6, -132(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s5 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s6, -140(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s6, -136(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s7, -136(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s7, -140(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s6 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s7, -148(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s7, -144(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s8, -144(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s8, -148(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s7 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s8, -156(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s8, -152(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s9, -152(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s9, -156(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s8 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s9, -164(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s9, -160(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s10, -160(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s10, -164(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s9 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s10, -172(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s10, -168(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s11, -168(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s11, -172(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s10 ; CHECK-RV32-NEXT: #NO_APP @@ -2288,31 +2289,34 @@ define void @relax_jal_spill_64_adjust_spill_slot() { ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t3, -184(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t3, -180(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t4, -180(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t4, -184(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t3 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t4, -192(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t4, -188(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t5, -188(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t5, -192(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t4 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t5, -196(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t5, -204(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lui a0, 1 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: lw t6, -196(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t5 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s0, -204(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s0, -208(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: lw t6, -200(a0) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll index 30374c13d60fe..ca290988b58ca 100644 --- a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll +++ b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll @@ -50,16 +50,16 @@ define void @callee() nounwind { ; RV32I-NEXT: sw s9, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s11, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: lui a7, %hi(var) -; RV32I-NEXT: lw a0, %lo(var)(a7) +; RV32I-NEXT: lui a6, %hi(var) +; RV32I-NEXT: lw a0, %lo(var)(a6) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var+4)(a7) +; RV32I-NEXT: lw a0, %lo(var+4)(a6) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var+8)(a7) +; RV32I-NEXT: lw a0, %lo(var+8)(a6) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var+12)(a7) +; RV32I-NEXT: lw a0, %lo(var+12)(a6) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: addi a5, a7, %lo(var) +; RV32I-NEXT: addi a5, a6, %lo(var) ; RV32I-NEXT: lw a0, 16(a5) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: lw a0, 20(a5) @@ -84,7 +84,7 @@ define void @callee() nounwind { ; RV32I-NEXT: lw s10, 92(a5) ; RV32I-NEXT: lw s11, 96(a5) ; RV32I-NEXT: lw ra, 100(a5) -; RV32I-NEXT: lw a6, 104(a5) +; RV32I-NEXT: lw a7, 104(a5) ; RV32I-NEXT: lw a4, 108(a5) ; RV32I-NEXT: lw a0, 124(a5) ; RV32I-NEXT: lw a1, 120(a5) @@ -95,7 +95,7 @@ define void @callee() nounwind { ; RV32I-NEXT: sw a2, 116(a5) ; RV32I-NEXT: sw a3, 112(a5) ; RV32I-NEXT: sw a4, 108(a5) -; RV32I-NEXT: sw a6, 104(a5) +; RV32I-NEXT: sw a7, 104(a5) ; RV32I-NEXT: sw ra, 100(a5) ; RV32I-NEXT: sw s11, 96(a5) ; RV32I-NEXT: sw s10, 92(a5) @@ -121,13 +121,13 @@ define void @callee() nounwind { ; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: sw a0, 16(a5) ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var+12)(a7) +; RV32I-NEXT: sw a0, %lo(var+12)(a6) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var+8)(a7) +; RV32I-NEXT: sw a0, %lo(var+8)(a6) ; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var+4)(a7) +; RV32I-NEXT: sw a0, %lo(var+4)(a6) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var)(a7) +; RV32I-NEXT: sw a0, %lo(var)(a6) ; RV32I-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 68(sp) # 4-byte Folded Reload @@ -161,16 +161,16 @@ define void @callee() nounwind { ; RV32I-WITH-FP-NEXT: sw s10, 32(sp) # 4-byte Folded Spill ; RV32I-WITH-FP-NEXT: sw s11, 28(sp) # 4-byte Folded Spill ; RV32I-WITH-FP-NEXT: addi s0, sp, 80 -; RV32I-WITH-FP-NEXT: lui a7, %hi(var) -; RV32I-WITH-FP-NEXT: lw a0, %lo(var)(a7) +; RV32I-WITH-FP-NEXT: lui a6, %hi(var) +; RV32I-WITH-FP-NEXT: lw a0, %lo(var)(a6) ; RV32I-WITH-FP-NEXT: sw a0, -56(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: lw a0, %lo(var+4)(a7) +; RV32I-WITH-FP-NEXT: lw a0, %lo(var+4)(a6) ; RV32I-WITH-FP-NEXT: sw a0, -60(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: lw a0, %lo(var+8)(a7) +; RV32I-WITH-FP-NEXT: lw a0, %lo(var+8)(a6) ; RV32I-WITH-FP-NEXT: sw a0, -64(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: lw a0, %lo(var+12)(a7) +; RV32I-WITH-FP-NEXT: lw a0, %lo(var+12)(a6) ; RV32I-WITH-FP-NEXT: sw a0, -68(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: addi a5, a7, %lo(var) +; RV32I-WITH-FP-NEXT: addi a5, a6, %lo(var) ; RV32I-WITH-FP-NEXT: lw a0, 16(a5) ; RV32I-WITH-FP-NEXT: sw a0, -72(s0) # 4-byte Folded Spill ; RV32I-WITH-FP-NEXT: lw a0, 20(a5) @@ -196,7 +196,7 @@ define void @callee() nounwind { ; RV32I-WITH-FP-NEXT: lw s11, 92(a5) ; RV32I-WITH-FP-NEXT: lw ra, 96(a5) ; RV32I-WITH-FP-NEXT: lw t0, 100(a5) -; RV32I-WITH-FP-NEXT: lw a6, 104(a5) +; RV32I-WITH-FP-NEXT: lw a7, 104(a5) ; RV32I-WITH-FP-NEXT: lw a4, 108(a5) ; RV32I-WITH-FP-NEXT: lw a0, 124(a5) ; RV32I-WITH-FP-NEXT: lw a1, 120(a5) @@ -207,7 +207,7 @@ define void @callee() nounwind { ; RV32I-WITH-FP-NEXT: sw a2, 116(a5) ; RV32I-WITH-FP-NEXT: sw a3, 112(a5) ; RV32I-WITH-FP-NEXT: sw a4, 108(a5) -; RV32I-WITH-FP-NEXT: sw a6, 104(a5) +; RV32I-WITH-FP-NEXT: sw a7, 104(a5) ; RV32I-WITH-FP-NEXT: sw t0, 100(a5) ; RV32I-WITH-FP-NEXT: sw ra, 96(a5) ; RV32I-WITH-FP-NEXT: sw s11, 92(a5) @@ -234,13 +234,13 @@ define void @callee() nounwind { ; RV32I-WITH-FP-NEXT: lw a0, -72(s0) # 4-byte Folded Reload ; RV32I-WITH-FP-NEXT: sw a0, 16(a5) ; RV32I-WITH-FP-NEXT: lw a0, -68(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var+12)(a7) +; RV32I-WITH-FP-NEXT: sw a0, %lo(var+12)(a6) ; RV32I-WITH-FP-NEXT: lw a0, -64(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var+8)(a7) +; RV32I-WITH-FP-NEXT: sw a0, %lo(var+8)(a6) ; RV32I-WITH-FP-NEXT: lw a0, -60(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var+4)(a7) +; RV32I-WITH-FP-NEXT: sw a0, %lo(var+4)(a6) ; RV32I-WITH-FP-NEXT: lw a0, -56(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var)(a7) +; RV32I-WITH-FP-NEXT: sw a0, %lo(var)(a6) ; RV32I-WITH-FP-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32I-WITH-FP-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32I-WITH-FP-NEXT: lw s1, 68(sp) # 4-byte Folded Reload @@ -260,16 +260,16 @@ define void @callee() nounwind { ; RV32IZCMP-LABEL: callee: ; RV32IZCMP: # %bb.0: ; RV32IZCMP-NEXT: cm.push {ra, s0-s11}, -96 -; RV32IZCMP-NEXT: lui a7, %hi(var) -; RV32IZCMP-NEXT: lw a0, %lo(var)(a7) +; RV32IZCMP-NEXT: lui a6, %hi(var) +; RV32IZCMP-NEXT: lw a0, %lo(var)(a6) ; RV32IZCMP-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var+4)(a7) +; RV32IZCMP-NEXT: lw a0, %lo(var+4)(a6) ; RV32IZCMP-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var+8)(a7) +; RV32IZCMP-NEXT: lw a0, %lo(var+8)(a6) ; RV32IZCMP-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var+12)(a7) +; RV32IZCMP-NEXT: lw a0, %lo(var+12)(a6) ; RV32IZCMP-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: addi a5, a7, %lo(var) +; RV32IZCMP-NEXT: addi a5, a6, %lo(var) ; RV32IZCMP-NEXT: lw a0, 16(a5) ; RV32IZCMP-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: lw a0, 20(a5) @@ -294,7 +294,7 @@ define void @callee() nounwind { ; RV32IZCMP-NEXT: lw t1, 92(a5) ; RV32IZCMP-NEXT: lw t0, 96(a5) ; RV32IZCMP-NEXT: lw s0, 100(a5) -; RV32IZCMP-NEXT: lw a6, 104(a5) +; RV32IZCMP-NEXT: lw a7, 104(a5) ; RV32IZCMP-NEXT: lw a4, 108(a5) ; RV32IZCMP-NEXT: lw a0, 124(a5) ; RV32IZCMP-NEXT: lw a1, 120(a5) @@ -305,7 +305,7 @@ define void @callee() nounwind { ; RV32IZCMP-NEXT: sw a2, 116(a5) ; RV32IZCMP-NEXT: sw a3, 112(a5) ; RV32IZCMP-NEXT: sw a4, 108(a5) -; RV32IZCMP-NEXT: sw a6, 104(a5) +; RV32IZCMP-NEXT: sw a7, 104(a5) ; RV32IZCMP-NEXT: sw s0, 100(a5) ; RV32IZCMP-NEXT: sw t0, 96(a5) ; RV32IZCMP-NEXT: sw t1, 92(a5) @@ -331,13 +331,13 @@ define void @callee() nounwind { ; RV32IZCMP-NEXT: lw a0, 12(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: sw a0, 16(a5) ; RV32IZCMP-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var+12)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var+12)(a6) ; RV32IZCMP-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var+8)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var+8)(a6) ; RV32IZCMP-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var+4)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var+4)(a6) ; RV32IZCMP-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var)(a6) ; RV32IZCMP-NEXT: cm.popret {ra, s0-s11}, 96 ; ; RV32IZCMP-WITH-FP-LABEL: callee: @@ -357,16 +357,16 @@ define void @callee() nounwind { ; RV32IZCMP-WITH-FP-NEXT: sw s10, 32(sp) # 4-byte Folded Spill ; RV32IZCMP-WITH-FP-NEXT: sw s11, 28(sp) # 4-byte Folded Spill ; RV32IZCMP-WITH-FP-NEXT: addi s0, sp, 80 -; RV32IZCMP-WITH-FP-NEXT: lui a7, %hi(var) -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(a7) +; RV32IZCMP-WITH-FP-NEXT: lui a6, %hi(var) +; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(a6) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -56(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+4)(a7) +; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+4)(a6) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -60(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+8)(a7) +; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+8)(a6) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -64(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+12)(a7) +; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+12)(a6) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -68(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: addi a5, a7, %lo(var) +; RV32IZCMP-WITH-FP-NEXT: addi a5, a6, %lo(var) ; RV32IZCMP-WITH-FP-NEXT: lw a0, 16(a5) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -72(s0) # 4-byte Folded Spill ; RV32IZCMP-WITH-FP-NEXT: lw a0, 20(a5) @@ -392,7 +392,7 @@ define void @callee() nounwind { ; RV32IZCMP-WITH-FP-NEXT: lw s1, 92(a5) ; RV32IZCMP-WITH-FP-NEXT: lw t1, 96(a5) ; RV32IZCMP-WITH-FP-NEXT: lw t0, 100(a5) -; RV32IZCMP-WITH-FP-NEXT: lw a6, 104(a5) +; RV32IZCMP-WITH-FP-NEXT: lw a7, 104(a5) ; RV32IZCMP-WITH-FP-NEXT: lw a4, 108(a5) ; RV32IZCMP-WITH-FP-NEXT: lw a0, 124(a5) ; RV32IZCMP-WITH-FP-NEXT: lw a1, 120(a5) @@ -403,7 +403,7 @@ define void @callee() nounwind { ; RV32IZCMP-WITH-FP-NEXT: sw a2, 116(a5) ; RV32IZCMP-WITH-FP-NEXT: sw a3, 112(a5) ; RV32IZCMP-WITH-FP-NEXT: sw a4, 108(a5) -; RV32IZCMP-WITH-FP-NEXT: sw a6, 104(a5) +; RV32IZCMP-WITH-FP-NEXT: sw a7, 104(a5) ; RV32IZCMP-WITH-FP-NEXT: sw t0, 100(a5) ; RV32IZCMP-WITH-FP-NEXT: sw t1, 96(a5) ; RV32IZCMP-WITH-FP-NEXT: sw s1, 92(a5) @@ -430,13 +430,13 @@ define void @callee() nounwind { ; RV32IZCMP-WITH-FP-NEXT: lw a0, -72(s0) # 4-byte Folded Reload ; RV32IZCMP-WITH-FP-NEXT: sw a0, 16(a5) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -68(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+12)(a7) +; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+12)(a6) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -64(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+8)(a7) +; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+8)(a6) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -60(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(a7) +; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(a6) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -56(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(a7) +; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(a6) ; RV32IZCMP-WITH-FP-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32IZCMP-WITH-FP-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32IZCMP-WITH-FP-NEXT: lw s1, 68(sp) # 4-byte Folded Reload @@ -469,16 +469,16 @@ define void @callee() nounwind { ; RV64I-NEXT: sd s9, 72(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 64(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 56(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a7, %hi(var) -; RV64I-NEXT: lw a0, %lo(var)(a7) +; RV64I-NEXT: lui a6, %hi(var) +; RV64I-NEXT: lw a0, %lo(var)(a6) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var+4)(a7) +; RV64I-NEXT: lw a0, %lo(var+4)(a6) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var+8)(a7) +; RV64I-NEXT: lw a0, %lo(var+8)(a6) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var+12)(a7) +; RV64I-NEXT: lw a0, %lo(var+12)(a6) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: addi a5, a7, %lo(var) +; RV64I-NEXT: addi a5, a6, %lo(var) ; RV64I-NEXT: lw a0, 16(a5) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: lw a0, 20(a5) @@ -503,7 +503,7 @@ define void @callee() nounwind { ; RV64I-NEXT: lw s10, 92(a5) ; RV64I-NEXT: lw s11, 96(a5) ; RV64I-NEXT: lw ra, 100(a5) -; RV64I-NEXT: lw a6, 104(a5) +; RV64I-NEXT: lw a7, 104(a5) ; RV64I-NEXT: lw a4, 108(a5) ; RV64I-NEXT: lw a0, 124(a5) ; RV64I-NEXT: lw a1, 120(a5) @@ -514,7 +514,7 @@ define void @callee() nounwind { ; RV64I-NEXT: sw a2, 116(a5) ; RV64I-NEXT: sw a3, 112(a5) ; RV64I-NEXT: sw a4, 108(a5) -; RV64I-NEXT: sw a6, 104(a5) +; RV64I-NEXT: sw a7, 104(a5) ; RV64I-NEXT: sw ra, 100(a5) ; RV64I-NEXT: sw s11, 96(a5) ; RV64I-NEXT: sw s10, 92(a5) @@ -540,13 +540,13 @@ define void @callee() nounwind { ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: sw a0, 16(a5) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var+12)(a7) +; RV64I-NEXT: sw a0, %lo(var+12)(a6) ; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var+8)(a7) +; RV64I-NEXT: sw a0, %lo(var+8)(a6) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var+4)(a7) +; RV64I-NEXT: sw a0, %lo(var+4)(a6) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var)(a7) +; RV64I-NEXT: sw a0, %lo(var)(a6) ; RV64I-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 144(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 136(sp) # 8-byte Folded Reload @@ -580,16 +580,16 @@ define void @callee() nounwind { ; RV64I-WITH-FP-NEXT: sd s10, 64(sp) # 8-byte Folded Spill ; RV64I-WITH-FP-NEXT: sd s11, 56(sp) # 8-byte Folded Spill ; RV64I-WITH-FP-NEXT: addi s0, sp, 160 -; RV64I-WITH-FP-NEXT: lui a7, %hi(var) -; RV64I-WITH-FP-NEXT: lw a0, %lo(var)(a7) +; RV64I-WITH-FP-NEXT: lui a6, %hi(var) +; RV64I-WITH-FP-NEXT: lw a0, %lo(var)(a6) ; RV64I-WITH-FP-NEXT: sd a0, -112(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: lw a0, %lo(var+4)(a7) +; RV64I-WITH-FP-NEXT: lw a0, %lo(var+4)(a6) ; RV64I-WITH-FP-NEXT: sd a0, -120(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: lw a0, %lo(var+8)(a7) +; RV64I-WITH-FP-NEXT: lw a0, %lo(var+8)(a6) ; RV64I-WITH-FP-NEXT: sd a0, -128(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: lw a0, %lo(var+12)(a7) +; RV64I-WITH-FP-NEXT: lw a0, %lo(var+12)(a6) ; RV64I-WITH-FP-NEXT: sd a0, -136(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: addi a5, a7, %lo(var) +; RV64I-WITH-FP-NEXT: addi a5, a6, %lo(var) ; RV64I-WITH-FP-NEXT: lw a0, 16(a5) ; RV64I-WITH-FP-NEXT: sd a0, -144(s0) # 8-byte Folded Spill ; RV64I-WITH-FP-NEXT: lw a0, 20(a5) @@ -615,7 +615,7 @@ define void @callee() nounwind { ; RV64I-WITH-FP-NEXT: lw s11, 92(a5) ; RV64I-WITH-FP-NEXT: lw ra, 96(a5) ; RV64I-WITH-FP-NEXT: lw t0, 100(a5) -; RV64I-WITH-FP-NEXT: lw a6, 104(a5) +; RV64I-WITH-FP-NEXT: lw a7, 104(a5) ; RV64I-WITH-FP-NEXT: lw a4, 108(a5) ; RV64I-WITH-FP-NEXT: lw a0, 124(a5) ; RV64I-WITH-FP-NEXT: lw a1, 120(a5) @@ -626,7 +626,7 @@ define void @callee() nounwind { ; RV64I-WITH-FP-NEXT: sw a2, 116(a5) ; RV64I-WITH-FP-NEXT: sw a3, 112(a5) ; RV64I-WITH-FP-NEXT: sw a4, 108(a5) -; RV64I-WITH-FP-NEXT: sw a6, 104(a5) +; RV64I-WITH-FP-NEXT: sw a7, 104(a5) ; RV64I-WITH-FP-NEXT: sw t0, 100(a5) ; RV64I-WITH-FP-NEXT: sw ra, 96(a5) ; RV64I-WITH-FP-NEXT: sw s11, 92(a5) @@ -653,13 +653,13 @@ define void @callee() nounwind { ; RV64I-WITH-FP-NEXT: ld a0, -144(s0) # 8-byte Folded Reload ; RV64I-WITH-FP-NEXT: sw a0, 16(a5) ; RV64I-WITH-FP-NEXT: ld a0, -136(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var+12)(a7) +; RV64I-WITH-FP-NEXT: sw a0, %lo(var+12)(a6) ; RV64I-WITH-FP-NEXT: ld a0, -128(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var+8)(a7) +; RV64I-WITH-FP-NEXT: sw a0, %lo(var+8)(a6) ; RV64I-WITH-FP-NEXT: ld a0, -120(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var+4)(a7) +; RV64I-WITH-FP-NEXT: sw a0, %lo(var+4)(a6) ; RV64I-WITH-FP-NEXT: ld a0, -112(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var)(a7) +; RV64I-WITH-FP-NEXT: sw a0, %lo(var)(a6) ; RV64I-WITH-FP-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64I-WITH-FP-NEXT: ld s0, 144(sp) # 8-byte Folded Reload ; RV64I-WITH-FP-NEXT: ld s1, 136(sp) # 8-byte Folded Reload @@ -679,16 +679,16 @@ define void @callee() nounwind { ; RV64IZCMP-LABEL: callee: ; RV64IZCMP: # %bb.0: ; RV64IZCMP-NEXT: cm.push {ra, s0-s11}, -160 -; RV64IZCMP-NEXT: lui a7, %hi(var) -; RV64IZCMP-NEXT: lw a0, %lo(var)(a7) +; RV64IZCMP-NEXT: lui a6, %hi(var) +; RV64IZCMP-NEXT: lw a0, %lo(var)(a6) ; RV64IZCMP-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var+4)(a7) +; RV64IZCMP-NEXT: lw a0, %lo(var+4)(a6) ; RV64IZCMP-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var+8)(a7) +; RV64IZCMP-NEXT: lw a0, %lo(var+8)(a6) ; RV64IZCMP-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var+12)(a7) +; RV64IZCMP-NEXT: lw a0, %lo(var+12)(a6) ; RV64IZCMP-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: addi a5, a7, %lo(var) +; RV64IZCMP-NEXT: addi a5, a6, %lo(var) ; RV64IZCMP-NEXT: lw a0, 16(a5) ; RV64IZCMP-NEXT: sd a0, 8(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: lw a0, 20(a5) @@ -713,7 +713,7 @@ define void @callee() nounwind { ; RV64IZCMP-NEXT: lw t1, 92(a5) ; RV64IZCMP-NEXT: lw t0, 96(a5) ; RV64IZCMP-NEXT: lw s0, 100(a5) -; RV64IZCMP-NEXT: lw a6, 104(a5) +; RV64IZCMP-NEXT: lw a7, 104(a5) ; RV64IZCMP-NEXT: lw a4, 108(a5) ; RV64IZCMP-NEXT: lw a0, 124(a5) ; RV64IZCMP-NEXT: lw a1, 120(a5) @@ -724,7 +724,7 @@ define void @callee() nounwind { ; RV64IZCMP-NEXT: sw a2, 116(a5) ; RV64IZCMP-NEXT: sw a3, 112(a5) ; RV64IZCMP-NEXT: sw a4, 108(a5) -; RV64IZCMP-NEXT: sw a6, 104(a5) +; RV64IZCMP-NEXT: sw a7, 104(a5) ; RV64IZCMP-NEXT: sw s0, 100(a5) ; RV64IZCMP-NEXT: sw t0, 96(a5) ; RV64IZCMP-NEXT: sw t1, 92(a5) @@ -750,13 +750,13 @@ define void @callee() nounwind { ; RV64IZCMP-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: sw a0, 16(a5) ; RV64IZCMP-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var+12)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var+12)(a6) ; RV64IZCMP-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var+8)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var+8)(a6) ; RV64IZCMP-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var+4)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var+4)(a6) ; RV64IZCMP-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var)(a6) ; RV64IZCMP-NEXT: cm.popret {ra, s0-s11}, 160 ; ; RV64IZCMP-WITH-FP-LABEL: callee: @@ -776,16 +776,16 @@ define void @callee() nounwind { ; RV64IZCMP-WITH-FP-NEXT: sd s10, 64(sp) # 8-byte Folded Spill ; RV64IZCMP-WITH-FP-NEXT: sd s11, 56(sp) # 8-byte Folded Spill ; RV64IZCMP-WITH-FP-NEXT: addi s0, sp, 160 -; RV64IZCMP-WITH-FP-NEXT: lui a7, %hi(var) -; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(a7) +; RV64IZCMP-WITH-FP-NEXT: lui a6, %hi(var) +; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(a6) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -112(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+4)(a7) +; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+4)(a6) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -120(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+8)(a7) +; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+8)(a6) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -128(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+12)(a7) +; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+12)(a6) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -136(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: addi a5, a7, %lo(var) +; RV64IZCMP-WITH-FP-NEXT: addi a5, a6, %lo(var) ; RV64IZCMP-WITH-FP-NEXT: lw a0, 16(a5) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -144(s0) # 8-byte Folded Spill ; RV64IZCMP-WITH-FP-NEXT: lw a0, 20(a5) @@ -811,7 +811,7 @@ define void @callee() nounwind { ; RV64IZCMP-WITH-FP-NEXT: lw s1, 92(a5) ; RV64IZCMP-WITH-FP-NEXT: lw t1, 96(a5) ; RV64IZCMP-WITH-FP-NEXT: lw t0, 100(a5) -; RV64IZCMP-WITH-FP-NEXT: lw a6, 104(a5) +; RV64IZCMP-WITH-FP-NEXT: lw a7, 104(a5) ; RV64IZCMP-WITH-FP-NEXT: lw a4, 108(a5) ; RV64IZCMP-WITH-FP-NEXT: lw a0, 124(a5) ; RV64IZCMP-WITH-FP-NEXT: lw a1, 120(a5) @@ -822,7 +822,7 @@ define void @callee() nounwind { ; RV64IZCMP-WITH-FP-NEXT: sw a2, 116(a5) ; RV64IZCMP-WITH-FP-NEXT: sw a3, 112(a5) ; RV64IZCMP-WITH-FP-NEXT: sw a4, 108(a5) -; RV64IZCMP-WITH-FP-NEXT: sw a6, 104(a5) +; RV64IZCMP-WITH-FP-NEXT: sw a7, 104(a5) ; RV64IZCMP-WITH-FP-NEXT: sw t0, 100(a5) ; RV64IZCMP-WITH-FP-NEXT: sw t1, 96(a5) ; RV64IZCMP-WITH-FP-NEXT: sw s1, 92(a5) @@ -849,13 +849,13 @@ define void @callee() nounwind { ; RV64IZCMP-WITH-FP-NEXT: ld a0, -144(s0) # 8-byte Folded Reload ; RV64IZCMP-WITH-FP-NEXT: sw a0, 16(a5) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -136(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+12)(a7) +; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+12)(a6) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -128(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+8)(a7) +; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+8)(a6) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -120(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(a7) +; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(a6) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -112(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(a7) +; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(a6) ; RV64IZCMP-WITH-FP-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64IZCMP-WITH-FP-NEXT: ld s0, 144(sp) # 8-byte Folded Reload ; RV64IZCMP-WITH-FP-NEXT: ld s1, 136(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll index 9d4ce80b0d544..066b6fe9c5348 100644 --- a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll +++ b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll @@ -24,31 +24,31 @@ define void @_Z3foov() { ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_49) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_49) ; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_48) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_48) -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v10, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_46) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_46) -; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vle16.v v10, (a0) ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_45) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_45) -; CHECK-NEXT: vle16.v v14, (a0) +; CHECK-NEXT: vle16.v v12, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v12, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v14, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vs2r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll index e3684af4f4de2..b091b0613c0f3 100644 --- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll +++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll @@ -3148,19 +3148,19 @@ define i64 @ustest_f64i64_mm(double %x) { ; RV32IF-NEXT: lw a0, 8(sp) ; RV32IF-NEXT: lw a1, 12(sp) ; RV32IF-NEXT: lw a2, 20(sp) -; RV32IF-NEXT: lw a4, 16(sp) +; RV32IF-NEXT: lw a3, 16(sp) ; RV32IF-NEXT: beqz a2, .LBB47_2 ; RV32IF-NEXT: # %bb.1: # %entry -; RV32IF-NEXT: slti a3, a2, 0 +; RV32IF-NEXT: slti a4, a2, 0 ; RV32IF-NEXT: j .LBB47_3 ; RV32IF-NEXT: .LBB47_2: -; RV32IF-NEXT: seqz a3, a4 +; RV32IF-NEXT: seqz a4, a3 ; RV32IF-NEXT: .LBB47_3: # %entry -; RV32IF-NEXT: xori a4, a4, 1 -; RV32IF-NEXT: or a4, a4, a2 -; RV32IF-NEXT: seqz a4, a4 -; RV32IF-NEXT: addi a4, a4, -1 -; RV32IF-NEXT: and a3, a4, a3 +; RV32IF-NEXT: xori a3, a3, 1 +; RV32IF-NEXT: or a3, a3, a2 +; RV32IF-NEXT: seqz a3, a3 +; RV32IF-NEXT: addi a3, a3, -1 +; RV32IF-NEXT: and a3, a3, a4 ; RV32IF-NEXT: neg a3, a3 ; RV32IF-NEXT: and a1, a3, a1 ; RV32IF-NEXT: and a0, a3, a0 @@ -3206,19 +3206,19 @@ define i64 @ustest_f64i64_mm(double %x) { ; RV32IFD-NEXT: lw a0, 8(sp) ; RV32IFD-NEXT: lw a1, 12(sp) ; RV32IFD-NEXT: lw a2, 20(sp) -; RV32IFD-NEXT: lw a4, 16(sp) +; RV32IFD-NEXT: lw a3, 16(sp) ; RV32IFD-NEXT: beqz a2, .LBB47_2 ; RV32IFD-NEXT: # %bb.1: # %entry -; RV32IFD-NEXT: slti a3, a2, 0 +; RV32IFD-NEXT: slti a4, a2, 0 ; RV32IFD-NEXT: j .LBB47_3 ; RV32IFD-NEXT: .LBB47_2: -; RV32IFD-NEXT: seqz a3, a4 +; RV32IFD-NEXT: seqz a4, a3 ; RV32IFD-NEXT: .LBB47_3: # %entry -; RV32IFD-NEXT: xori a4, a4, 1 -; RV32IFD-NEXT: or a4, a4, a2 -; RV32IFD-NEXT: seqz a4, a4 -; RV32IFD-NEXT: addi a4, a4, -1 -; RV32IFD-NEXT: and a3, a4, a3 +; RV32IFD-NEXT: xori a3, a3, 1 +; RV32IFD-NEXT: or a3, a3, a2 +; RV32IFD-NEXT: seqz a3, a3 +; RV32IFD-NEXT: addi a3, a3, -1 +; RV32IFD-NEXT: and a3, a3, a4 ; RV32IFD-NEXT: neg a3, a3 ; RV32IFD-NEXT: and a1, a3, a1 ; RV32IFD-NEXT: and a0, a3, a0 @@ -3374,19 +3374,19 @@ define i64 @ustest_f32i64_mm(float %x) { ; RV32-NEXT: lw a0, 8(sp) ; RV32-NEXT: lw a1, 12(sp) ; RV32-NEXT: lw a2, 20(sp) -; RV32-NEXT: lw a4, 16(sp) +; RV32-NEXT: lw a3, 16(sp) ; RV32-NEXT: beqz a2, .LBB50_2 ; RV32-NEXT: # %bb.1: # %entry -; RV32-NEXT: slti a3, a2, 0 +; RV32-NEXT: slti a4, a2, 0 ; RV32-NEXT: j .LBB50_3 ; RV32-NEXT: .LBB50_2: -; RV32-NEXT: seqz a3, a4 +; RV32-NEXT: seqz a4, a3 ; RV32-NEXT: .LBB50_3: # %entry -; RV32-NEXT: xori a4, a4, 1 -; RV32-NEXT: or a4, a4, a2 -; RV32-NEXT: seqz a4, a4 -; RV32-NEXT: addi a4, a4, -1 -; RV32-NEXT: and a3, a4, a3 +; RV32-NEXT: xori a3, a3, 1 +; RV32-NEXT: or a3, a3, a2 +; RV32-NEXT: seqz a3, a3 +; RV32-NEXT: addi a3, a3, -1 +; RV32-NEXT: and a3, a3, a4 ; RV32-NEXT: neg a3, a3 ; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: and a0, a3, a0 @@ -3599,19 +3599,19 @@ define i64 @ustest_f16i64_mm(half %x) { ; RV32-NEXT: lw a0, 8(sp) ; RV32-NEXT: lw a1, 12(sp) ; RV32-NEXT: lw a2, 20(sp) -; RV32-NEXT: lw a4, 16(sp) +; RV32-NEXT: lw a3, 16(sp) ; RV32-NEXT: beqz a2, .LBB53_2 ; RV32-NEXT: # %bb.1: # %entry -; RV32-NEXT: slti a3, a2, 0 +; RV32-NEXT: slti a4, a2, 0 ; RV32-NEXT: j .LBB53_3 ; RV32-NEXT: .LBB53_2: -; RV32-NEXT: seqz a3, a4 +; RV32-NEXT: seqz a4, a3 ; RV32-NEXT: .LBB53_3: # %entry -; RV32-NEXT: xori a4, a4, 1 -; RV32-NEXT: or a4, a4, a2 -; RV32-NEXT: seqz a4, a4 -; RV32-NEXT: addi a4, a4, -1 -; RV32-NEXT: and a3, a4, a3 +; RV32-NEXT: xori a3, a3, 1 +; RV32-NEXT: or a3, a3, a2 +; RV32-NEXT: seqz a3, a3 +; RV32-NEXT: addi a3, a3, -1 +; RV32-NEXT: and a3, a3, a4 ; RV32-NEXT: neg a3, a3 ; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: and a0, a3, a0 diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll index 341db9a1a172a..f2b7e8d26328d 100644 --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -1252,39 +1252,39 @@ define i128 @muli128_m63(i128 %a) nounwind { ; RV32I-LABEL: muli128_m63: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw a5, 12(a1) -; RV32I-NEXT: lw a7, 8(a1) -; RV32I-NEXT: lw a3, 4(a1) -; RV32I-NEXT: slli a1, a2, 6 -; RV32I-NEXT: sltu a4, a2, a1 -; RV32I-NEXT: srli a6, a2, 26 -; RV32I-NEXT: slli t0, a3, 6 -; RV32I-NEXT: or a6, t0, a6 -; RV32I-NEXT: mv t0, a4 -; RV32I-NEXT: beq a3, a6, .LBB31_2 +; RV32I-NEXT: lw a4, 12(a1) +; RV32I-NEXT: lw a6, 8(a1) +; RV32I-NEXT: lw a1, 4(a1) +; RV32I-NEXT: slli a3, a2, 6 +; RV32I-NEXT: sltu a5, a2, a3 +; RV32I-NEXT: srli a7, a2, 26 +; RV32I-NEXT: slli t0, a1, 6 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: mv t0, a5 +; RV32I-NEXT: beq a1, a7, .LBB31_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t0, a3, a6 +; RV32I-NEXT: sltu t0, a1, a7 ; RV32I-NEXT: .LBB31_2: -; RV32I-NEXT: srli t1, a3, 26 -; RV32I-NEXT: slli t2, a7, 6 +; RV32I-NEXT: srli t1, a1, 26 +; RV32I-NEXT: slli t2, a6, 6 ; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: sub t2, a7, t1 +; RV32I-NEXT: sub t2, a6, t1 ; RV32I-NEXT: sltu t3, t2, t0 -; RV32I-NEXT: sltu t1, a7, t1 -; RV32I-NEXT: srli a7, a7, 26 -; RV32I-NEXT: slli t4, a5, 6 -; RV32I-NEXT: or a7, t4, a7 -; RV32I-NEXT: sub a5, a5, a7 -; RV32I-NEXT: sub a5, a5, t1 -; RV32I-NEXT: sub a5, a5, t3 -; RV32I-NEXT: sub a7, t2, t0 -; RV32I-NEXT: sub a3, a3, a6 -; RV32I-NEXT: sub a3, a3, a4 -; RV32I-NEXT: sub a2, a2, a1 +; RV32I-NEXT: sltu t1, a6, t1 +; RV32I-NEXT: srli a6, a6, 26 +; RV32I-NEXT: slli t4, a4, 6 +; RV32I-NEXT: or a6, t4, a6 +; RV32I-NEXT: sub a4, a4, a6 +; RV32I-NEXT: sub a4, a4, t1 +; RV32I-NEXT: sub a4, a4, t3 +; RV32I-NEXT: sub a6, t2, t0 +; RV32I-NEXT: sub a1, a1, a7 +; RV32I-NEXT: sub a1, a1, a5 +; RV32I-NEXT: sub a2, a2, a3 ; RV32I-NEXT: sw a2, 0(a0) -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a7, 8(a0) -; RV32I-NEXT: sw a5, 12(a0) +; RV32I-NEXT: sw a1, 4(a0) +; RV32I-NEXT: sw a6, 8(a0) +; RV32I-NEXT: sw a4, 12(a0) ; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli128_m63: diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll index ffbecc5074d3a..42f998e68bb6e 100644 --- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll @@ -1074,7 +1074,7 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) { ; RV32-NEXT: .cfi_offset s4, -24 ; RV32-NEXT: .cfi_offset s5, -28 ; RV32-NEXT: .cfi_offset s6, -32 -; RV32-NEXT: mv s4, a5 +; RV32-NEXT: mv s2, a5 ; RV32-NEXT: andi a5, a5, 1 ; RV32-NEXT: beqz a5, .LBB32_8 ; RV32-NEXT: # %bb.1: # %t @@ -1082,19 +1082,19 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) { ; RV32-NEXT: mv s3, a3 ; RV32-NEXT: mv s1, a2 ; RV32-NEXT: mv s5, a1 -; RV32-NEXT: mv s2, a0 +; RV32-NEXT: mv s4, a0 ; RV32-NEXT: beq a1, a3, .LBB32_3 ; RV32-NEXT: # %bb.2: # %t ; RV32-NEXT: sltu s6, s5, s3 ; RV32-NEXT: j .LBB32_4 ; RV32-NEXT: .LBB32_3: -; RV32-NEXT: sltu s6, s2, s1 +; RV32-NEXT: sltu s6, s4, s1 ; RV32-NEXT: .LBB32_4: # %t ; RV32-NEXT: mv a0, s6 ; RV32-NEXT: call call@plt ; RV32-NEXT: beqz s6, .LBB32_8 ; RV32-NEXT: # %bb.5: # %end -; RV32-NEXT: sltu a1, s2, s1 +; RV32-NEXT: sltu a1, s4, s1 ; RV32-NEXT: mv a0, a1 ; RV32-NEXT: beq s5, s3, .LBB32_7 ; RV32-NEXT: # %bb.6: # %end @@ -1102,12 +1102,12 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) { ; RV32-NEXT: .LBB32_7: # %end ; RV32-NEXT: sub a2, s5, s3 ; RV32-NEXT: sub a2, a2, a1 -; RV32-NEXT: sub a1, s2, s1 +; RV32-NEXT: sub a1, s4, s1 ; RV32-NEXT: sw a1, 0(s0) ; RV32-NEXT: sw a2, 4(s0) ; RV32-NEXT: j .LBB32_9 ; RV32-NEXT: .LBB32_8: # %f -; RV32-NEXT: mv a0, s4 +; RV32-NEXT: mv a0, s2 ; RV32-NEXT: .LBB32_9: # %f ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/push-pop-popret.ll b/llvm/test/CodeGen/RISCV/push-pop-popret.ll index af3828ed7d839..84e4062ca333d 100644 --- a/llvm/test/CodeGen/RISCV/push-pop-popret.ll +++ b/llvm/test/CodeGen/RISCV/push-pop-popret.ll @@ -1841,16 +1841,16 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-NEXT: sw t4, 44(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: sw t5, 40(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: sw t6, 36(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lui a7, %hi(var_test_irq) -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV32IZCMP-NEXT: lui a6, %hi(var_test_irq) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV32IZCMP-NEXT: sw a0, 32(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV32IZCMP-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV32IZCMP-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV32IZCMP-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: addi a5, a7, %lo(var_test_irq) +; RV32IZCMP-NEXT: addi a5, a6, %lo(var_test_irq) ; RV32IZCMP-NEXT: lw a0, 16(a5) ; RV32IZCMP-NEXT: sw a0, 16(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: lw a0, 20(a5) @@ -1875,7 +1875,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-NEXT: lw t1, 92(a5) ; RV32IZCMP-NEXT: lw t0, 96(a5) ; RV32IZCMP-NEXT: lw s0, 100(a5) -; RV32IZCMP-NEXT: lw a6, 104(a5) +; RV32IZCMP-NEXT: lw a7, 104(a5) ; RV32IZCMP-NEXT: lw a4, 108(a5) ; RV32IZCMP-NEXT: lw a0, 124(a5) ; RV32IZCMP-NEXT: lw a1, 120(a5) @@ -1886,7 +1886,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-NEXT: sw a2, 116(a5) ; RV32IZCMP-NEXT: sw a3, 112(a5) ; RV32IZCMP-NEXT: sw a4, 108(a5) -; RV32IZCMP-NEXT: sw a6, 104(a5) +; RV32IZCMP-NEXT: sw a7, 104(a5) ; RV32IZCMP-NEXT: sw s0, 100(a5) ; RV32IZCMP-NEXT: sw t0, 96(a5) ; RV32IZCMP-NEXT: sw t1, 92(a5) @@ -1912,13 +1912,13 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-NEXT: lw a0, 16(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: sw a0, 16(a5) ; RV32IZCMP-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV32IZCMP-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV32IZCMP-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV32IZCMP-NEXT: lw a0, 32(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV32IZCMP-NEXT: lw t0, 92(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: lw t1, 88(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: lw t2, 84(sp) # 4-byte Folded Reload @@ -1957,16 +1957,16 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-NEXT: sd t4, 72(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: sd t5, 64(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: sd t6, 56(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lui a7, %hi(var_test_irq) -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV64IZCMP-NEXT: lui a6, %hi(var_test_irq) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV64IZCMP-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV64IZCMP-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV64IZCMP-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV64IZCMP-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: addi a5, a7, %lo(var_test_irq) +; RV64IZCMP-NEXT: addi a5, a6, %lo(var_test_irq) ; RV64IZCMP-NEXT: lw a0, 16(a5) ; RV64IZCMP-NEXT: sd a0, 16(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: lw a0, 20(a5) @@ -1991,7 +1991,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-NEXT: lw t1, 92(a5) ; RV64IZCMP-NEXT: lw t0, 96(a5) ; RV64IZCMP-NEXT: lw s0, 100(a5) -; RV64IZCMP-NEXT: lw a6, 104(a5) +; RV64IZCMP-NEXT: lw a7, 104(a5) ; RV64IZCMP-NEXT: lw a4, 108(a5) ; RV64IZCMP-NEXT: lw a0, 124(a5) ; RV64IZCMP-NEXT: lw a1, 120(a5) @@ -2002,7 +2002,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-NEXT: sw a2, 116(a5) ; RV64IZCMP-NEXT: sw a3, 112(a5) ; RV64IZCMP-NEXT: sw a4, 108(a5) -; RV64IZCMP-NEXT: sw a6, 104(a5) +; RV64IZCMP-NEXT: sw a7, 104(a5) ; RV64IZCMP-NEXT: sw s0, 100(a5) ; RV64IZCMP-NEXT: sw t0, 96(a5) ; RV64IZCMP-NEXT: sw t1, 92(a5) @@ -2028,13 +2028,13 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: sw a0, 16(a5) ; RV64IZCMP-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV64IZCMP-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV64IZCMP-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV64IZCMP-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV64IZCMP-NEXT: ld t0, 168(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: ld t1, 160(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: ld t2, 152(sp) # 8-byte Folded Reload @@ -2073,16 +2073,16 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-SR-NEXT: sw t4, 44(sp) # 4-byte Folded Spill ; RV32IZCMP-SR-NEXT: sw t5, 40(sp) # 4-byte Folded Spill ; RV32IZCMP-SR-NEXT: sw t6, 36(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lui a7, %hi(var_test_irq) -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV32IZCMP-SR-NEXT: lui a6, %hi(var_test_irq) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV32IZCMP-SR-NEXT: sw a0, 32(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV32IZCMP-SR-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV32IZCMP-SR-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV32IZCMP-SR-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: addi a5, a7, %lo(var_test_irq) +; RV32IZCMP-SR-NEXT: addi a5, a6, %lo(var_test_irq) ; RV32IZCMP-SR-NEXT: lw a0, 16(a5) ; RV32IZCMP-SR-NEXT: sw a0, 16(sp) # 4-byte Folded Spill ; RV32IZCMP-SR-NEXT: lw a0, 20(a5) @@ -2107,7 +2107,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-SR-NEXT: lw t1, 92(a5) ; RV32IZCMP-SR-NEXT: lw t0, 96(a5) ; RV32IZCMP-SR-NEXT: lw s0, 100(a5) -; RV32IZCMP-SR-NEXT: lw a6, 104(a5) +; RV32IZCMP-SR-NEXT: lw a7, 104(a5) ; RV32IZCMP-SR-NEXT: lw a4, 108(a5) ; RV32IZCMP-SR-NEXT: lw a0, 124(a5) ; RV32IZCMP-SR-NEXT: lw a1, 120(a5) @@ -2118,7 +2118,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-SR-NEXT: sw a2, 116(a5) ; RV32IZCMP-SR-NEXT: sw a3, 112(a5) ; RV32IZCMP-SR-NEXT: sw a4, 108(a5) -; RV32IZCMP-SR-NEXT: sw a6, 104(a5) +; RV32IZCMP-SR-NEXT: sw a7, 104(a5) ; RV32IZCMP-SR-NEXT: sw s0, 100(a5) ; RV32IZCMP-SR-NEXT: sw t0, 96(a5) ; RV32IZCMP-SR-NEXT: sw t1, 92(a5) @@ -2144,13 +2144,13 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-SR-NEXT: lw a0, 16(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: sw a0, 16(a5) ; RV32IZCMP-SR-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV32IZCMP-SR-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV32IZCMP-SR-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV32IZCMP-SR-NEXT: lw a0, 32(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV32IZCMP-SR-NEXT: lw t0, 92(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: lw t1, 88(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: lw t2, 84(sp) # 4-byte Folded Reload @@ -2189,16 +2189,16 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-SR-NEXT: sd t4, 72(sp) # 8-byte Folded Spill ; RV64IZCMP-SR-NEXT: sd t5, 64(sp) # 8-byte Folded Spill ; RV64IZCMP-SR-NEXT: sd t6, 56(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lui a7, %hi(var_test_irq) -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV64IZCMP-SR-NEXT: lui a6, %hi(var_test_irq) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV64IZCMP-SR-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV64IZCMP-SR-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV64IZCMP-SR-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV64IZCMP-SR-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: addi a5, a7, %lo(var_test_irq) +; RV64IZCMP-SR-NEXT: addi a5, a6, %lo(var_test_irq) ; RV64IZCMP-SR-NEXT: lw a0, 16(a5) ; RV64IZCMP-SR-NEXT: sd a0, 16(sp) # 8-byte Folded Spill ; RV64IZCMP-SR-NEXT: lw a0, 20(a5) @@ -2223,7 +2223,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-SR-NEXT: lw t1, 92(a5) ; RV64IZCMP-SR-NEXT: lw t0, 96(a5) ; RV64IZCMP-SR-NEXT: lw s0, 100(a5) -; RV64IZCMP-SR-NEXT: lw a6, 104(a5) +; RV64IZCMP-SR-NEXT: lw a7, 104(a5) ; RV64IZCMP-SR-NEXT: lw a4, 108(a5) ; RV64IZCMP-SR-NEXT: lw a0, 124(a5) ; RV64IZCMP-SR-NEXT: lw a1, 120(a5) @@ -2234,7 +2234,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-SR-NEXT: sw a2, 116(a5) ; RV64IZCMP-SR-NEXT: sw a3, 112(a5) ; RV64IZCMP-SR-NEXT: sw a4, 108(a5) -; RV64IZCMP-SR-NEXT: sw a6, 104(a5) +; RV64IZCMP-SR-NEXT: sw a7, 104(a5) ; RV64IZCMP-SR-NEXT: sw s0, 100(a5) ; RV64IZCMP-SR-NEXT: sw t0, 96(a5) ; RV64IZCMP-SR-NEXT: sw t1, 92(a5) @@ -2260,13 +2260,13 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-SR-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: sw a0, 16(a5) ; RV64IZCMP-SR-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV64IZCMP-SR-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV64IZCMP-SR-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV64IZCMP-SR-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV64IZCMP-SR-NEXT: ld t0, 168(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: ld t1, 160(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: ld t2, 152(sp) # 8-byte Folded Reload @@ -2317,16 +2317,16 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32I-NEXT: sw t4, 40(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw t5, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw t6, 32(sp) # 4-byte Folded Spill -; RV32I-NEXT: lui a7, %hi(var_test_irq) -; RV32I-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV32I-NEXT: lui a6, %hi(var_test_irq) +; RV32I-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV32I-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV32I-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV32I-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV32I-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: addi a5, a7, %lo(var_test_irq) +; RV32I-NEXT: addi a5, a6, %lo(var_test_irq) ; RV32I-NEXT: lw a0, 16(a5) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: lw a0, 20(a5) @@ -2351,7 +2351,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32I-NEXT: lw s10, 92(a5) ; RV32I-NEXT: lw s11, 96(a5) ; RV32I-NEXT: lw ra, 100(a5) -; RV32I-NEXT: lw a6, 104(a5) +; RV32I-NEXT: lw a7, 104(a5) ; RV32I-NEXT: lw a4, 108(a5) ; RV32I-NEXT: lw a0, 124(a5) ; RV32I-NEXT: lw a1, 120(a5) @@ -2362,7 +2362,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32I-NEXT: sw a2, 116(a5) ; RV32I-NEXT: sw a3, 112(a5) ; RV32I-NEXT: sw a4, 108(a5) -; RV32I-NEXT: sw a6, 104(a5) +; RV32I-NEXT: sw a7, 104(a5) ; RV32I-NEXT: sw ra, 100(a5) ; RV32I-NEXT: sw s11, 96(a5) ; RV32I-NEXT: sw s10, 92(a5) @@ -2388,13 +2388,13 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: sw a0, 16(a5) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV32I-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw t0, 136(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw t1, 132(sp) # 4-byte Folded Reload @@ -2457,16 +2457,16 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64I-NEXT: sd t4, 64(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd t5, 56(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd t6, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a7, %hi(var_test_irq) -; RV64I-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV64I-NEXT: lui a6, %hi(var_test_irq) +; RV64I-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV64I-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV64I-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV64I-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: addi a5, a7, %lo(var_test_irq) +; RV64I-NEXT: addi a5, a6, %lo(var_test_irq) ; RV64I-NEXT: lw a0, 16(a5) ; RV64I-NEXT: sd a0, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lw a0, 20(a5) @@ -2491,7 +2491,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64I-NEXT: lw s10, 92(a5) ; RV64I-NEXT: lw s11, 96(a5) ; RV64I-NEXT: lw ra, 100(a5) -; RV64I-NEXT: lw a6, 104(a5) +; RV64I-NEXT: lw a7, 104(a5) ; RV64I-NEXT: lw a4, 108(a5) ; RV64I-NEXT: lw a0, 124(a5) ; RV64I-NEXT: lw a1, 120(a5) @@ -2502,7 +2502,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64I-NEXT: sw a2, 116(a5) ; RV64I-NEXT: sw a3, 112(a5) ; RV64I-NEXT: sw a4, 108(a5) -; RV64I-NEXT: sw a6, 104(a5) +; RV64I-NEXT: sw a7, 104(a5) ; RV64I-NEXT: sw ra, 100(a5) ; RV64I-NEXT: sw s11, 96(a5) ; RV64I-NEXT: sw s10, 92(a5) @@ -2528,13 +2528,13 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: sw a0, 16(a5) ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV64I-NEXT: ld ra, 264(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld t0, 256(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld t1, 248(sp) # 8-byte Folded Reload @@ -2574,16 +2574,16 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-LABEL: callee_no_irq: ; RV32IZCMP: # %bb.0: ; RV32IZCMP-NEXT: cm.push {ra, s0-s11}, -96 -; RV32IZCMP-NEXT: lui a7, %hi(var_test_irq) -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV32IZCMP-NEXT: lui a6, %hi(var_test_irq) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV32IZCMP-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV32IZCMP-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV32IZCMP-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV32IZCMP-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: addi a5, a7, %lo(var_test_irq) +; RV32IZCMP-NEXT: addi a5, a6, %lo(var_test_irq) ; RV32IZCMP-NEXT: lw a0, 16(a5) ; RV32IZCMP-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: lw a0, 20(a5) @@ -2608,7 +2608,7 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-NEXT: lw t1, 92(a5) ; RV32IZCMP-NEXT: lw t0, 96(a5) ; RV32IZCMP-NEXT: lw s0, 100(a5) -; RV32IZCMP-NEXT: lw a6, 104(a5) +; RV32IZCMP-NEXT: lw a7, 104(a5) ; RV32IZCMP-NEXT: lw a4, 108(a5) ; RV32IZCMP-NEXT: lw a0, 124(a5) ; RV32IZCMP-NEXT: lw a1, 120(a5) @@ -2619,7 +2619,7 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-NEXT: sw a2, 116(a5) ; RV32IZCMP-NEXT: sw a3, 112(a5) ; RV32IZCMP-NEXT: sw a4, 108(a5) -; RV32IZCMP-NEXT: sw a6, 104(a5) +; RV32IZCMP-NEXT: sw a7, 104(a5) ; RV32IZCMP-NEXT: sw s0, 100(a5) ; RV32IZCMP-NEXT: sw t0, 96(a5) ; RV32IZCMP-NEXT: sw t1, 92(a5) @@ -2645,28 +2645,28 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-NEXT: lw a0, 12(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: sw a0, 16(a5) ; RV32IZCMP-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV32IZCMP-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV32IZCMP-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV32IZCMP-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV32IZCMP-NEXT: cm.popret {ra, s0-s11}, 96 ; ; RV64IZCMP-LABEL: callee_no_irq: ; RV64IZCMP: # %bb.0: ; RV64IZCMP-NEXT: cm.push {ra, s0-s11}, -160 -; RV64IZCMP-NEXT: lui a7, %hi(var_test_irq) -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV64IZCMP-NEXT: lui a6, %hi(var_test_irq) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV64IZCMP-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV64IZCMP-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV64IZCMP-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV64IZCMP-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: addi a5, a7, %lo(var_test_irq) +; RV64IZCMP-NEXT: addi a5, a6, %lo(var_test_irq) ; RV64IZCMP-NEXT: lw a0, 16(a5) ; RV64IZCMP-NEXT: sd a0, 8(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: lw a0, 20(a5) @@ -2691,7 +2691,7 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-NEXT: lw t1, 92(a5) ; RV64IZCMP-NEXT: lw t0, 96(a5) ; RV64IZCMP-NEXT: lw s0, 100(a5) -; RV64IZCMP-NEXT: lw a6, 104(a5) +; RV64IZCMP-NEXT: lw a7, 104(a5) ; RV64IZCMP-NEXT: lw a4, 108(a5) ; RV64IZCMP-NEXT: lw a0, 124(a5) ; RV64IZCMP-NEXT: lw a1, 120(a5) @@ -2702,7 +2702,7 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-NEXT: sw a2, 116(a5) ; RV64IZCMP-NEXT: sw a3, 112(a5) ; RV64IZCMP-NEXT: sw a4, 108(a5) -; RV64IZCMP-NEXT: sw a6, 104(a5) +; RV64IZCMP-NEXT: sw a7, 104(a5) ; RV64IZCMP-NEXT: sw s0, 100(a5) ; RV64IZCMP-NEXT: sw t0, 96(a5) ; RV64IZCMP-NEXT: sw t1, 92(a5) @@ -2728,29 +2728,29 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: sw a0, 16(a5) ; RV64IZCMP-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV64IZCMP-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV64IZCMP-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV64IZCMP-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV64IZCMP-NEXT: cm.popret {ra, s0-s11}, 160 ; ; RV32IZCMP-SR-LABEL: callee_no_irq: ; RV32IZCMP-SR: # %bb.0: ; RV32IZCMP-SR-NEXT: call t0, __riscv_save_12 ; RV32IZCMP-SR-NEXT: addi sp, sp, -32 -; RV32IZCMP-SR-NEXT: lui a7, %hi(var_test_irq) -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV32IZCMP-SR-NEXT: lui a6, %hi(var_test_irq) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV32IZCMP-SR-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV32IZCMP-SR-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV32IZCMP-SR-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV32IZCMP-SR-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: addi a5, a7, %lo(var_test_irq) +; RV32IZCMP-SR-NEXT: addi a5, a6, %lo(var_test_irq) ; RV32IZCMP-SR-NEXT: lw a0, 16(a5) ; RV32IZCMP-SR-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; RV32IZCMP-SR-NEXT: lw a0, 20(a5) @@ -2775,7 +2775,7 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-SR-NEXT: lw t1, 92(a5) ; RV32IZCMP-SR-NEXT: lw t0, 96(a5) ; RV32IZCMP-SR-NEXT: lw s0, 100(a5) -; RV32IZCMP-SR-NEXT: lw a6, 104(a5) +; RV32IZCMP-SR-NEXT: lw a7, 104(a5) ; RV32IZCMP-SR-NEXT: lw a4, 108(a5) ; RV32IZCMP-SR-NEXT: lw a0, 124(a5) ; RV32IZCMP-SR-NEXT: lw a1, 120(a5) @@ -2786,7 +2786,7 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-SR-NEXT: sw a2, 116(a5) ; RV32IZCMP-SR-NEXT: sw a3, 112(a5) ; RV32IZCMP-SR-NEXT: sw a4, 108(a5) -; RV32IZCMP-SR-NEXT: sw a6, 104(a5) +; RV32IZCMP-SR-NEXT: sw a7, 104(a5) ; RV32IZCMP-SR-NEXT: sw s0, 100(a5) ; RV32IZCMP-SR-NEXT: sw t0, 96(a5) ; RV32IZCMP-SR-NEXT: sw t1, 92(a5) @@ -2812,13 +2812,13 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-SR-NEXT: lw a0, 12(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: sw a0, 16(a5) ; RV32IZCMP-SR-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV32IZCMP-SR-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV32IZCMP-SR-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV32IZCMP-SR-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV32IZCMP-SR-NEXT: addi sp, sp, 32 ; RV32IZCMP-SR-NEXT: tail __riscv_restore_12 ; @@ -2826,16 +2826,16 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-SR: # %bb.0: ; RV64IZCMP-SR-NEXT: call t0, __riscv_save_12 ; RV64IZCMP-SR-NEXT: addi sp, sp, -48 -; RV64IZCMP-SR-NEXT: lui a7, %hi(var_test_irq) -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV64IZCMP-SR-NEXT: lui a6, %hi(var_test_irq) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV64IZCMP-SR-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV64IZCMP-SR-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV64IZCMP-SR-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV64IZCMP-SR-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: addi a5, a7, %lo(var_test_irq) +; RV64IZCMP-SR-NEXT: addi a5, a6, %lo(var_test_irq) ; RV64IZCMP-SR-NEXT: lw a0, 16(a5) ; RV64IZCMP-SR-NEXT: sd a0, 8(sp) # 8-byte Folded Spill ; RV64IZCMP-SR-NEXT: lw a0, 20(a5) @@ -2860,7 +2860,7 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-SR-NEXT: lw t1, 92(a5) ; RV64IZCMP-SR-NEXT: lw t0, 96(a5) ; RV64IZCMP-SR-NEXT: lw s0, 100(a5) -; RV64IZCMP-SR-NEXT: lw a6, 104(a5) +; RV64IZCMP-SR-NEXT: lw a7, 104(a5) ; RV64IZCMP-SR-NEXT: lw a4, 108(a5) ; RV64IZCMP-SR-NEXT: lw a0, 124(a5) ; RV64IZCMP-SR-NEXT: lw a1, 120(a5) @@ -2871,7 +2871,7 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-SR-NEXT: sw a2, 116(a5) ; RV64IZCMP-SR-NEXT: sw a3, 112(a5) ; RV64IZCMP-SR-NEXT: sw a4, 108(a5) -; RV64IZCMP-SR-NEXT: sw a6, 104(a5) +; RV64IZCMP-SR-NEXT: sw a7, 104(a5) ; RV64IZCMP-SR-NEXT: sw s0, 100(a5) ; RV64IZCMP-SR-NEXT: sw t0, 96(a5) ; RV64IZCMP-SR-NEXT: sw t1, 92(a5) @@ -2897,13 +2897,13 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-SR-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: sw a0, 16(a5) ; RV64IZCMP-SR-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV64IZCMP-SR-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV64IZCMP-SR-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV64IZCMP-SR-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV64IZCMP-SR-NEXT: addi sp, sp, 48 ; RV64IZCMP-SR-NEXT: tail __riscv_restore_12 ; @@ -2923,16 +2923,16 @@ define void @callee_no_irq() nounwind{ ; RV32I-NEXT: sw s9, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s11, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: lui a7, %hi(var_test_irq) -; RV32I-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV32I-NEXT: lui a6, %hi(var_test_irq) +; RV32I-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV32I-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV32I-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV32I-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: addi a5, a7, %lo(var_test_irq) +; RV32I-NEXT: addi a5, a6, %lo(var_test_irq) ; RV32I-NEXT: lw a0, 16(a5) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: lw a0, 20(a5) @@ -2957,7 +2957,7 @@ define void @callee_no_irq() nounwind{ ; RV32I-NEXT: lw s10, 92(a5) ; RV32I-NEXT: lw s11, 96(a5) ; RV32I-NEXT: lw ra, 100(a5) -; RV32I-NEXT: lw a6, 104(a5) +; RV32I-NEXT: lw a7, 104(a5) ; RV32I-NEXT: lw a4, 108(a5) ; RV32I-NEXT: lw a0, 124(a5) ; RV32I-NEXT: lw a1, 120(a5) @@ -2968,7 +2968,7 @@ define void @callee_no_irq() nounwind{ ; RV32I-NEXT: sw a2, 116(a5) ; RV32I-NEXT: sw a3, 112(a5) ; RV32I-NEXT: sw a4, 108(a5) -; RV32I-NEXT: sw a6, 104(a5) +; RV32I-NEXT: sw a7, 104(a5) ; RV32I-NEXT: sw ra, 100(a5) ; RV32I-NEXT: sw s11, 96(a5) ; RV32I-NEXT: sw s10, 92(a5) @@ -2994,13 +2994,13 @@ define void @callee_no_irq() nounwind{ ; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: sw a0, 16(a5) ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV32I-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 68(sp) # 4-byte Folded Reload @@ -3033,16 +3033,16 @@ define void @callee_no_irq() nounwind{ ; RV64I-NEXT: sd s9, 72(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 64(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 56(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a7, %hi(var_test_irq) -; RV64I-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV64I-NEXT: lui a6, %hi(var_test_irq) +; RV64I-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV64I-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV64I-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV64I-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: addi a5, a7, %lo(var_test_irq) +; RV64I-NEXT: addi a5, a6, %lo(var_test_irq) ; RV64I-NEXT: lw a0, 16(a5) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: lw a0, 20(a5) @@ -3067,7 +3067,7 @@ define void @callee_no_irq() nounwind{ ; RV64I-NEXT: lw s10, 92(a5) ; RV64I-NEXT: lw s11, 96(a5) ; RV64I-NEXT: lw ra, 100(a5) -; RV64I-NEXT: lw a6, 104(a5) +; RV64I-NEXT: lw a7, 104(a5) ; RV64I-NEXT: lw a4, 108(a5) ; RV64I-NEXT: lw a0, 124(a5) ; RV64I-NEXT: lw a1, 120(a5) @@ -3078,7 +3078,7 @@ define void @callee_no_irq() nounwind{ ; RV64I-NEXT: sw a2, 116(a5) ; RV64I-NEXT: sw a3, 112(a5) ; RV64I-NEXT: sw a4, 108(a5) -; RV64I-NEXT: sw a6, 104(a5) +; RV64I-NEXT: sw a7, 104(a5) ; RV64I-NEXT: sw ra, 100(a5) ; RV64I-NEXT: sw s11, 96(a5) ; RV64I-NEXT: sw s10, 92(a5) @@ -3104,13 +3104,13 @@ define void @callee_no_irq() nounwind{ ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: sw a0, 16(a5) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV64I-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 144(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 136(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll index 0479cb223907f..71040bf2646d2 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll @@ -138,25 +138,25 @@ declare i64 @llvm.fshl.i64(i64, i64, i64) define i64 @rol_i64(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: rol_i64: ; CHECK: # %bb.0: -; CHECK-NEXT: slli a5, a2, 26 -; CHECK-NEXT: srli a5, a5, 31 +; CHECK-NEXT: slli a3, a2, 26 +; CHECK-NEXT: srli a3, a3, 31 ; CHECK-NEXT: mv a4, a1 -; CHECK-NEXT: bnez a5, .LBB7_2 +; CHECK-NEXT: bnez a3, .LBB7_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a4, a0 ; CHECK-NEXT: .LBB7_2: -; CHECK-NEXT: sll a3, a4, a2 -; CHECK-NEXT: bnez a5, .LBB7_4 +; CHECK-NEXT: sll a5, a4, a2 +; CHECK-NEXT: bnez a3, .LBB7_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: srli a1, a0, 1 -; CHECK-NEXT: not a5, a2 -; CHECK-NEXT: srl a1, a1, a5 -; CHECK-NEXT: or a3, a3, a1 +; CHECK-NEXT: not a6, a2 +; CHECK-NEXT: srl a3, a1, a6 +; CHECK-NEXT: or a3, a5, a3 ; CHECK-NEXT: sll a0, a0, a2 ; CHECK-NEXT: srli a4, a4, 1 -; CHECK-NEXT: srl a1, a4, a5 +; CHECK-NEXT: srl a1, a4, a6 ; CHECK-NEXT: or a1, a0, a1 ; CHECK-NEXT: mv a0, a3 ; CHECK-NEXT: ret @@ -191,24 +191,24 @@ declare i64 @llvm.fshr.i64(i64, i64, i64) define i64 @ror_i64(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: ror_i64: ; CHECK: # %bb.0: -; CHECK-NEXT: andi a5, a2, 32 +; CHECK-NEXT: andi a4, a2, 32 ; CHECK-NEXT: mv a3, a0 -; CHECK-NEXT: beqz a5, .LBB9_2 +; CHECK-NEXT: beqz a4, .LBB9_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a3, a1 ; CHECK-NEXT: .LBB9_2: -; CHECK-NEXT: srl a4, a3, a2 -; CHECK-NEXT: beqz a5, .LBB9_4 +; CHECK-NEXT: srl a5, a3, a2 +; CHECK-NEXT: beqz a4, .LBB9_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: .LBB9_4: ; CHECK-NEXT: slli a0, a1, 1 -; CHECK-NEXT: not a5, a2 -; CHECK-NEXT: sll a0, a0, a5 -; CHECK-NEXT: or a0, a0, a4 +; CHECK-NEXT: not a4, a2 +; CHECK-NEXT: sll a0, a0, a4 +; CHECK-NEXT: or a0, a0, a5 ; CHECK-NEXT: srl a1, a1, a2 ; CHECK-NEXT: slli a3, a3, 1 -; CHECK-NEXT: sll a2, a3, a5 +; CHECK-NEXT: sll a2, a3, a4 ; CHECK-NEXT: or a1, a2, a1 ; CHECK-NEXT: ret %or = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %b) diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll index 2cdc07ef140ed..aa3d7b3fa8a7c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll @@ -1512,26 +1512,26 @@ define @bitreverse_nxv8i64( %va) { ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v0, v8, 24 ; RV32-NEXT: addi a3, sp, 8 -; RV32-NEXT: vlse64.v v24, (a3), zero +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: lui a3, 4080 ; RV32-NEXT: vand.vx v0, v0, a3 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v24, v24, v0 ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vl8r.v v0, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vand.vx v0, v8, a2 ; RV32-NEXT: vsll.vx v0, v0, a1 -; RV32-NEXT: vsll.vx v16, v8, a0 -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vsll.vx v24, v8, a0 +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 ; RV32-NEXT: vand.vx v8, v8, a3 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v24, v8 ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v8, v16 diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll index 25bee211fb2b5..690ecc6eab33b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll @@ -607,26 +607,26 @@ define @bswap_nxv8i64( %va) { ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v0, v8, 24 ; RV32-NEXT: addi a3, sp, 8 -; RV32-NEXT: vlse64.v v24, (a3), zero +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: lui a3, 4080 ; RV32-NEXT: vand.vx v0, v0, a3 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v24, v24, v0 ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vl8r.v v0, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vand.vx v0, v8, a2 ; RV32-NEXT: vsll.vx v0, v0, a1 -; RV32-NEXT: vsll.vx v16, v8, a0 -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vsll.vx v24, v8, a0 +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 ; RV32-NEXT: vand.vx v8, v8, a3 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v24, v8 ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v8, v16 diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll index f68ac2212b527..7ce167f892973 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll @@ -737,14 +737,16 @@ define @vp_ceil_vv_nxv16f64( %va, < ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: vmv1r.v v1, v0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 3 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v2, v0, a2 +; CHECK-NEXT: vslidedown.vx v25, v0, a2 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 @@ -752,43 +754,49 @@ define @vp_ceil_vv_nxv16f64( %va, < ; CHECK-NEXT: lui a3, %hi(.LCPI32_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3) ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v25, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a2, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a0, a1, .LBB32_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB32_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v1, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll index 50db8a7592c45..1c003a33c54bf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll @@ -2697,9 +2697,9 @@ define @vp_ctpop_nxv16i64( %va, @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v1, v0, 2 +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: @@ -798,32 +798,32 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll index 4b3cf6181514b..e53877f53833f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll @@ -2700,18 +2700,18 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vadd.vv v16, v8, v16, v0.t ; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero @@ -5841,18 +5841,18 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vadd.vv v16, v8, v16, v0.t ; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll index 48aa01a0e141e..55485beff8eb1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll @@ -1916,18 +1916,18 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a2, 48 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 +; RV32-NEXT: li a2, 40 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v24, v0, 2 +; RV32-NEXT: vslidedown.vi v1, v0, 2 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 44(sp) @@ -1955,59 +1955,43 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi a2, sp, 40 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 +; RV32-NEXT: li a4, 24 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vlse64.v v8, (a2), zero ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 -; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: slli a2, a2, 5 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 40 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsub.vv v24, v8, v16, v0.t ; RV32-NEXT: addi a2, sp, 32 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a2), zero ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 40 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 4 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 4 @@ -2016,31 +2000,28 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v16, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vadd.vv v16, v16, v8, v0.t ; RV32-NEXT: addi a2, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: vlse64.v v8, (a2), zero ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: slli a2, a2, 4 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a2, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a2), zero -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: addi a2, sp, 48 ; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill @@ -2049,82 +2030,50 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmv1r.v v0, v24 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a0, a0, a2 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t -; RV32-NEXT: addi a0, sp, 48 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 24 +; RV32-NEXT: li a2, 40 ; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v24, v8, v0.t ; RV32-NEXT: vsub.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 40 +; RV32-NEXT: li a2, 24 ; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a0, a0, a2 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 40 -; RV32-NEXT: mul a0, a0, a2 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a0, a0, a2 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 56 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 48 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll index bb30de41c4479..28df7f083c4a0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll @@ -2182,16 +2182,16 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: li a2, 16 +; RV32-NEXT: li a3, 16 ; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB34_2 +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: bltu a0, a3, .LBB34_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 +; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB34_2: -; RV32-NEXT: li a2, 1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a2, v0.t +; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v16, v8, a1, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a3, vlenb @@ -2216,7 +2216,7 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 @@ -2246,7 +2246,7 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 @@ -2272,20 +2272,20 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero @@ -2294,10 +2294,10 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: li a2, 56 +; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 @@ -2313,10 +2313,10 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vx v16, v8, a2, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v8, v16, a1, v0.t +; RV32-NEXT: vnot.v v16, v16, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t @@ -2326,8 +2326,8 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -2341,8 +2341,8 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -2354,8 +2354,8 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -2369,8 +2369,8 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 40 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: li a1, 40 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -2381,7 +2381,7 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 @@ -2408,24 +2408,24 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: li a2, 16 +; RV64-NEXT: li a1, 16 ; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB34_2 +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: bltu a0, a1, .LBB34_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 +; RV64-NEXT: li a2, 16 ; RV64-NEXT: .LBB34_2: -; RV64-NEXT: li a2, 1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a2, v0.t +; RV64-NEXT: li a1, 1 +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64-NEXT: vsub.vx v16, v8, a1, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a1, 349525 -; RV64-NEXT: addiw a1, a1, 1365 -; RV64-NEXT: slli a3, a1, 32 -; RV64-NEXT: add a1, a1, a3 -; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: vand.vx v16, v16, a2, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t ; RV64-NEXT: lui a3, 209715 ; RV64-NEXT: addiw a3, a3, 819 @@ -2462,11 +2462,11 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vsub.vx v16, v8, a2, v0.t +; RV64-NEXT: vsub.vx v16, v8, a1, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vand.vx v16, v16, a2, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t ; RV64-NEXT: vand.vx v16, v8, a3, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -4827,16 +4827,16 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: li a2, 16 +; RV32-NEXT: li a3, 16 ; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB70_2 +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: bltu a0, a3, .LBB70_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 +; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB70_2: -; RV32-NEXT: li a2, 1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a2, v0.t +; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v16, v8, a1, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a3, vlenb @@ -4861,7 +4861,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 @@ -4891,7 +4891,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 @@ -4917,20 +4917,20 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero @@ -4939,10 +4939,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: li a2, 56 +; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 @@ -4958,10 +4958,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vx v16, v8, a2, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v8, v16, a1, v0.t +; RV32-NEXT: vnot.v v16, v16, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t @@ -4971,8 +4971,8 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -4986,8 +4986,8 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -4999,8 +4999,8 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -5014,8 +5014,8 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 40 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: li a1, 40 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -5026,7 +5026,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 @@ -5053,24 +5053,24 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: li a2, 16 +; RV64-NEXT: li a1, 16 ; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB70_2 +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: bltu a0, a1, .LBB70_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 +; RV64-NEXT: li a2, 16 ; RV64-NEXT: .LBB70_2: -; RV64-NEXT: li a2, 1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a2, v0.t +; RV64-NEXT: li a1, 1 +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64-NEXT: vsub.vx v16, v8, a1, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a1, 349525 -; RV64-NEXT: addiw a1, a1, 1365 -; RV64-NEXT: slli a3, a1, 32 -; RV64-NEXT: add a1, a1, a3 -; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: vand.vx v16, v16, a2, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t ; RV64-NEXT: lui a3, 209715 ; RV64-NEXT: addiw a3, a3, 819 @@ -5107,11 +5107,11 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vsub.vx v16, v8, a2, v0.t +; RV64-NEXT: vsub.vx v16, v8, a1, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vand.vx v16, v16, a2, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t ; RV64-NEXT: vand.vx v16, v8, a3, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll index 7a26bf2bfdf0c..84b3e142d5aea 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll @@ -777,7 +777,7 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v1, v0, 2 +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: @@ -798,32 +798,32 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll index 4e60edf058450..84ef9283802b9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll @@ -1031,72 +1031,80 @@ define i64 @explode_16xi64(<16 x i64> %v) { ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vslidedown.vi v16, v8, 1 ; RV32-NEXT: vsrl.vx v24, v16, a1 -; RV32-NEXT: vmv.x.s a5, v24 -; RV32-NEXT: vmv.x.s a6, v16 +; RV32-NEXT: vmv.x.s s11, v24 +; RV32-NEXT: vmv.x.s ra, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 2 ; RV32-NEXT: vsrl.vx v24, v16, a1 ; RV32-NEXT: vmv.x.s a3, v24 ; RV32-NEXT: vmv.x.s a4, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 3 ; RV32-NEXT: vsrl.vx v24, v16, a1 +; RV32-NEXT: vmv.x.s s0, v24 +; RV32-NEXT: vmv.x.s a5, v16 +; RV32-NEXT: vslidedown.vi v16, v8, 4 +; RV32-NEXT: vsrl.vx v24, v16, a1 +; RV32-NEXT: vmv.x.s s1, v24 +; RV32-NEXT: vmv.x.s a6, v16 +; RV32-NEXT: vslidedown.vi v16, v8, 5 +; RV32-NEXT: vsrl.vx v24, v16, a1 ; RV32-NEXT: vmv.x.s s2, v24 ; RV32-NEXT: vmv.x.s a7, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 4 +; RV32-NEXT: vslidedown.vi v16, v8, 6 ; RV32-NEXT: vsrl.vx v24, v16, a1 ; RV32-NEXT: vmv.x.s s3, v24 ; RV32-NEXT: vmv.x.s t0, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 5 +; RV32-NEXT: vslidedown.vi v16, v8, 7 ; RV32-NEXT: vsrl.vx v24, v16, a1 ; RV32-NEXT: vmv.x.s s4, v24 ; RV32-NEXT: vmv.x.s t1, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 6 +; RV32-NEXT: vslidedown.vi v16, v8, 8 ; RV32-NEXT: vsrl.vx v24, v16, a1 ; RV32-NEXT: vmv.x.s s5, v24 ; RV32-NEXT: vmv.x.s t2, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 7 +; RV32-NEXT: vslidedown.vi v16, v8, 9 ; RV32-NEXT: vsrl.vx v24, v16, a1 ; RV32-NEXT: vmv.x.s s6, v24 ; RV32-NEXT: vmv.x.s t3, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 8 +; RV32-NEXT: vslidedown.vi v16, v8, 10 ; RV32-NEXT: vsrl.vx v24, v16, a1 ; RV32-NEXT: vmv.x.s s7, v24 ; RV32-NEXT: vmv.x.s t4, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 9 +; RV32-NEXT: vslidedown.vi v16, v8, 11 ; RV32-NEXT: vsrl.vx v24, v16, a1 ; RV32-NEXT: vmv.x.s s8, v24 ; RV32-NEXT: vmv.x.s t5, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 10 +; RV32-NEXT: vslidedown.vi v16, v8, 12 ; RV32-NEXT: vsrl.vx v24, v16, a1 ; RV32-NEXT: vmv.x.s s9, v24 ; RV32-NEXT: vmv.x.s t6, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 11 -; RV32-NEXT: vsrl.vx v24, v16, a1 -; RV32-NEXT: vmv.x.s s10, v24 -; RV32-NEXT: vmv.x.s s0, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 12 -; RV32-NEXT: vsrl.vx v24, v16, a1 -; RV32-NEXT: vmv.x.s s11, v24 -; RV32-NEXT: vmv.x.s s1, v16 -; RV32-NEXT: vslidedown.vi v0, v8, 13 -; RV32-NEXT: vsrl.vx v16, v0, a1 -; RV32-NEXT: vmv.x.s ra, v16 +; RV32-NEXT: vslidedown.vi v24, v8, 13 +; RV32-NEXT: vsrl.vx v16, v24, a1 +; RV32-NEXT: vmv.x.s s10, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 14 -; RV32-NEXT: vsrl.vx v24, v16, a1 +; RV32-NEXT: vsrl.vx v0, v16, a1 ; RV32-NEXT: vslidedown.vi v8, v8, 15 -; RV32-NEXT: vmv.x.s a2, v0 -; RV32-NEXT: vsrl.vx v0, v8, a1 +; RV32-NEXT: vmv.x.s a2, v24 +; RV32-NEXT: vsrl.vx v24, v8, a1 ; RV32-NEXT: lw a1, 8(sp) # 4-byte Folded Reload -; RV32-NEXT: add a5, a1, a5 -; RV32-NEXT: add a6, a0, a6 -; RV32-NEXT: sltu a0, a6, a0 -; RV32-NEXT: add a0, a5, a0 +; RV32-NEXT: add s11, a1, s11 +; RV32-NEXT: add ra, a0, ra +; RV32-NEXT: sltu a0, ra, a0 +; RV32-NEXT: add a0, s11, a0 ; RV32-NEXT: add a0, a0, a3 -; RV32-NEXT: add a4, a6, a4 -; RV32-NEXT: sltu a1, a4, a6 +; RV32-NEXT: add a4, ra, a4 +; RV32-NEXT: sltu a1, a4, ra +; RV32-NEXT: add a1, a1, s0 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add a5, a4, a5 +; RV32-NEXT: sltu a1, a5, a4 +; RV32-NEXT: add a1, a1, s1 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add a6, a5, a6 +; RV32-NEXT: sltu a1, a6, a5 ; RV32-NEXT: add a1, a1, s2 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a7, a4, a7 -; RV32-NEXT: sltu a1, a7, a4 +; RV32-NEXT: add a7, a6, a7 +; RV32-NEXT: sltu a1, a7, a6 ; RV32-NEXT: add a1, a1, s3 ; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: add t0, a7, t0 @@ -1127,21 +1135,13 @@ define i64 @explode_16xi64(<16 x i64> %v) { ; RV32-NEXT: sltu a1, t6, t5 ; RV32-NEXT: add a1, a1, s10 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add s0, t6, s0 -; RV32-NEXT: sltu a1, s0, t6 -; RV32-NEXT: add a1, a1, s11 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add s1, s0, s1 -; RV32-NEXT: sltu a1, s1, s0 -; RV32-NEXT: add a1, a1, ra -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: vmv.x.s a1, v24 -; RV32-NEXT: add a2, s1, a2 -; RV32-NEXT: sltu a3, a2, s1 +; RV32-NEXT: vmv.x.s a1, v0 +; RV32-NEXT: add a2, t6, a2 +; RV32-NEXT: sltu a3, a2, t6 ; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: vmv.x.s a3, v16 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: vmv.x.s a1, v0 +; RV32-NEXT: vmv.x.s a1, v24 ; RV32-NEXT: add a3, a2, a3 ; RV32-NEXT: sltu a2, a3, a2 ; RV32-NEXT: add a1, a2, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index ed8ebe83b89af..b3099f6b57056 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -141,7 +141,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v16, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 38 +; RV32-NEXT: li a5, 29 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 @@ -149,13 +149,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vid.v v10 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: slli a5, a4, 3 +; RV32-NEXT: add a4, a5, a4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs2r.v v10, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vadd.vi v8, v10, -4 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 12 +; RV32-NEXT: li a5, 13 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 @@ -163,7 +164,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV32-NEXT: vrgatherei16.vv v12, v16, v8 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 18 +; RV32-NEXT: li a5, 21 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 @@ -173,15 +174,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a4, 12 ; RV32-NEXT: vmv.s.x v0, a4 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 22 -; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: slli a4, a4, 3 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v16, v16, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 46 +; RV32-NEXT: li a5, 45 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 @@ -189,7 +189,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 26 +; RV32-NEXT: li a5, 25 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 @@ -211,14 +211,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v16, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 54 +; RV32-NEXT: li a4, 37 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v24, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 30 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -226,8 +226,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a6, -64 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 14 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill @@ -242,28 +242,29 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 26 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vmv.v.v v12, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 26 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl2r.v v10, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vi v8, v10, -2 ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 38 +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -271,29 +272,28 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v12, v16, v8 ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV32-NEXT: vadd.vi v8, v10, -8 -; RV32-NEXT: vmv2r.v v22, v10 +; RV32-NEXT: vmv2r.v v30, v10 ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 22 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v20, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv1r.v v0, v20 +; RV32-NEXT: vl1r.v v28, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv1r.v v0, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 46 +; RV32-NEXT: li a3, 45 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v24, v8, v0.t -; RV32-NEXT: vmv.v.v v16, v12 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t +; RV32-NEXT: vmv.v.v v24, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_2) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_2) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_3) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_3) -; RV32-NEXT: vle16.v v24, (a1) +; RV32-NEXT: vle16.v v16, (a1) ; RV32-NEXT: vle16.v v8, (a3) ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 2 @@ -301,66 +301,66 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 54 +; RV32-NEXT: li a3, 37 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v24 +; RV32-NEXT: vrgatherei16.vv v8, v0, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 14 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 30 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t +; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v16, v8 +; RV32-NEXT: vmv.v.v v24, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 14 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_4) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_4) ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 38 +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v24, v8 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v4, v16, v8 ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vadd.vi v8, v22, -6 +; RV32-NEXT: vadd.vi v8, v30, -6 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; RV32-NEXT: vmv1r.v v0, v20 +; RV32-NEXT: vmv1r.v v0, v28 +; RV32-NEXT: vmv1r.v v2, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 46 +; RV32-NEXT: li a3, 45 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: vmv.v.v v16, v12 +; RV32-NEXT: vrgatherei16.vv v4, v16, v8, v0.t ; RV32-NEXT: lui a1, %hi(.LCPI6_5) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_5) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu @@ -369,77 +369,68 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vle16.v v20, (a1) ; RV32-NEXT: vle16.v v8, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: li a1, 960 -; RV32-NEXT: vmv.s.x v24, a1 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs1r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv.s.x v1, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 54 +; RV32-NEXT: li a3, 37 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v20 -; RV32-NEXT: vmv1r.v v0, v24 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v8, v20 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 30 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v20, v0.t +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v16, v8 +; RV32-NEXT: vmv.v.v v4, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_7) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7) ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 38 +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v12, v16, v8 +; RV32-NEXT: vmv1r.v v0, v2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 22 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 +; RV32-NEXT: li a3, 13 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 46 +; RV32-NEXT: li a3, 45 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 22 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv.v.v v4, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_8) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_8) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu @@ -448,162 +439,161 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vle16.v v16, (a1) ; RV32-NEXT: vle16.v v20, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 54 +; RV32-NEXT: li a3, 37 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v16 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v20, v0.t -; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v16 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 22 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv.v.v v12, v8 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v20, v0.t +; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v4, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 22 +; RV32-NEXT: li a3, 13 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: lui a1, %hi(.LCPI6_10) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_10) -; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: lui a1, 15 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 38 +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v14, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v14 +; RV32-NEXT: vl2r.v v10, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v20, v16, v10 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 46 +; RV32-NEXT: li a3, 45 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v12, v0.t +; RV32-NEXT: vrgatherei16.vv v20, v24, v8, v0.t ; RV32-NEXT: lui a1, %hi(.LCPI6_11) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_11) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_12) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_12) -; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vle16.v v24, (a1) ; RV32-NEXT: vle16.v v16, (a3) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: li a1, 1008 -; RV32-NEXT: vmv.s.x v16, a1 +; RV32-NEXT: vmv.s.x v28, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 38 +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs1r.v v28, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 54 +; RV32-NEXT: li a3, 37 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v0, v12 -; RV32-NEXT: vmv1r.v v0, v16 +; RV32-NEXT: vrgatherei16.vv v8, v0, v24 +; RV32-NEXT: vmv1r.v v0, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 30 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v12, v0.t +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v8, v24 +; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: lui a1, %hi(.LCPI6_13) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_13) ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vle16.v v16, (a1) +; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 18 +; RV32-NEXT: li a3, 21 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 46 +; RV32-NEXT: li a3, 45 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v24, v16, v0.t +; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t ; RV32-NEXT: lui a1, %hi(.LCPI6_14) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_14) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a2, %hi(.LCPI6_15) ; RV32-NEXT: addi a2, a2, %lo(.LCPI6_15) -; RV32-NEXT: vle16.v v0, (a1) -; RV32-NEXT: vle16.v v4, (a2) +; RV32-NEXT: vle16.v v24, (a1) +; RV32-NEXT: vle16.v v8, (a2) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 54 +; RV32-NEXT: li a2, 45 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v0 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 38 +; RV32-NEXT: li a2, 37 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v0, v24 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 29 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 30 +; RV32-NEXT: li a2, 53 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v4, v0.t +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 45 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v12, v24 +; RV32-NEXT: vmv.v.v v16, v8 ; RV32-NEXT: addi a1, a0, 320 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vse32.v v12, (a1) +; RV32-NEXT: vse32.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 -; RV32-NEXT: vse32.v v8, (a1) +; RV32-NEXT: vse32.v v20, (a1) ; RV32-NEXT: addi a1, a0, 192 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 22 +; RV32-NEXT: li a3, 13 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 @@ -611,21 +601,22 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: slli a3, a2, 3 +; RV32-NEXT: add a2, a3, a2 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 14 -; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: slli a3, a2, 4 +; RV32-NEXT: add a2, a3, a2 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 26 +; RV32-NEXT: li a2, 25 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -843,7 +834,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vrgatherei16.vv v8, v16, v28 ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v28, v2, -13 +; RV64-NEXT: vadd.vi v16, v2, -13 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; RV64-NEXT: vmv1r.v v0, v4 ; RV64-NEXT: csrr a1, vlenb @@ -851,8 +842,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v16, v28, v0.t +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v8, v24, v16, v0.t ; RV64-NEXT: lui a1, 16 ; RV64-NEXT: addiw a1, a1, 7 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 130d2c7613b32..112c0a0e598d0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -6893,89 +6893,89 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV64ZVE32F-LABEL: mgather_baseidx_v8i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a7, v0 -; RV64ZVE32F-NEXT: andi a4, a7, 1 +; RV64ZVE32F-NEXT: vmv.x.s a6, v0 +; RV64ZVE32F-NEXT: andi a4, a6, 1 ; RV64ZVE32F-NEXT: beqz a4, .LBB57_9 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: ld a4, 0(a2) ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) -; RV64ZVE32F-NEXT: andi a5, a7, 2 +; RV64ZVE32F-NEXT: andi a5, a6, 2 ; RV64ZVE32F-NEXT: bnez a5, .LBB57_10 ; RV64ZVE32F-NEXT: .LBB57_2: ; RV64ZVE32F-NEXT: ld a5, 8(a3) -; RV64ZVE32F-NEXT: andi a6, a7, 4 -; RV64ZVE32F-NEXT: bnez a6, .LBB57_11 +; RV64ZVE32F-NEXT: andi a7, a6, 4 +; RV64ZVE32F-NEXT: bnez a7, .LBB57_11 ; RV64ZVE32F-NEXT: .LBB57_3: -; RV64ZVE32F-NEXT: ld a6, 16(a3) -; RV64ZVE32F-NEXT: andi t0, a7, 8 +; RV64ZVE32F-NEXT: ld a7, 16(a3) +; RV64ZVE32F-NEXT: andi t0, a6, 8 ; RV64ZVE32F-NEXT: bnez t0, .LBB57_12 ; RV64ZVE32F-NEXT: .LBB57_4: ; RV64ZVE32F-NEXT: ld t0, 24(a3) -; RV64ZVE32F-NEXT: andi t1, a7, 16 +; RV64ZVE32F-NEXT: andi t1, a6, 16 ; RV64ZVE32F-NEXT: bnez t1, .LBB57_13 ; RV64ZVE32F-NEXT: .LBB57_5: ; RV64ZVE32F-NEXT: ld t1, 32(a3) -; RV64ZVE32F-NEXT: andi t2, a7, 32 +; RV64ZVE32F-NEXT: andi t2, a6, 32 ; RV64ZVE32F-NEXT: bnez t2, .LBB57_14 ; RV64ZVE32F-NEXT: .LBB57_6: ; RV64ZVE32F-NEXT: ld t2, 40(a3) -; RV64ZVE32F-NEXT: andi t3, a7, 64 +; RV64ZVE32F-NEXT: andi t3, a6, 64 ; RV64ZVE32F-NEXT: bnez t3, .LBB57_15 ; RV64ZVE32F-NEXT: .LBB57_7: ; RV64ZVE32F-NEXT: ld t3, 48(a3) -; RV64ZVE32F-NEXT: andi a7, a7, -128 -; RV64ZVE32F-NEXT: bnez a7, .LBB57_16 +; RV64ZVE32F-NEXT: andi a6, a6, -128 +; RV64ZVE32F-NEXT: bnez a6, .LBB57_16 ; RV64ZVE32F-NEXT: .LBB57_8: ; RV64ZVE32F-NEXT: ld a1, 56(a3) ; RV64ZVE32F-NEXT: j .LBB57_17 ; RV64ZVE32F-NEXT: .LBB57_9: ; RV64ZVE32F-NEXT: ld a4, 0(a3) -; RV64ZVE32F-NEXT: andi a5, a7, 2 +; RV64ZVE32F-NEXT: andi a5, a6, 2 ; RV64ZVE32F-NEXT: beqz a5, .LBB57_2 ; RV64ZVE32F-NEXT: .LBB57_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a5, 8(a2) ; RV64ZVE32F-NEXT: slli a5, a5, 3 ; RV64ZVE32F-NEXT: add a5, a1, a5 ; RV64ZVE32F-NEXT: ld a5, 0(a5) -; RV64ZVE32F-NEXT: andi a6, a7, 4 -; RV64ZVE32F-NEXT: beqz a6, .LBB57_3 +; RV64ZVE32F-NEXT: andi a7, a6, 4 +; RV64ZVE32F-NEXT: beqz a7, .LBB57_3 ; RV64ZVE32F-NEXT: .LBB57_11: # %cond.load4 -; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: slli a6, a6, 3 -; RV64ZVE32F-NEXT: add a6, a1, a6 -; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: andi t0, a7, 8 +; RV64ZVE32F-NEXT: ld a7, 16(a2) +; RV64ZVE32F-NEXT: slli a7, a7, 3 +; RV64ZVE32F-NEXT: add a7, a1, a7 +; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: andi t0, a6, 8 ; RV64ZVE32F-NEXT: beqz t0, .LBB57_4 ; RV64ZVE32F-NEXT: .LBB57_12: # %cond.load7 ; RV64ZVE32F-NEXT: ld t0, 24(a2) ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a7, 16 +; RV64ZVE32F-NEXT: andi t1, a6, 16 ; RV64ZVE32F-NEXT: beqz t1, .LBB57_5 ; RV64ZVE32F-NEXT: .LBB57_13: # %cond.load10 ; RV64ZVE32F-NEXT: ld t1, 32(a2) ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: andi t2, a7, 32 +; RV64ZVE32F-NEXT: andi t2, a6, 32 ; RV64ZVE32F-NEXT: beqz t2, .LBB57_6 ; RV64ZVE32F-NEXT: .LBB57_14: # %cond.load13 ; RV64ZVE32F-NEXT: ld t2, 40(a2) ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: andi t3, a7, 64 +; RV64ZVE32F-NEXT: andi t3, a6, 64 ; RV64ZVE32F-NEXT: beqz t3, .LBB57_7 ; RV64ZVE32F-NEXT: .LBB57_15: # %cond.load16 ; RV64ZVE32F-NEXT: ld t3, 48(a2) ; RV64ZVE32F-NEXT: slli t3, t3, 3 ; RV64ZVE32F-NEXT: add t3, a1, t3 ; RV64ZVE32F-NEXT: ld t3, 0(t3) -; RV64ZVE32F-NEXT: andi a7, a7, -128 -; RV64ZVE32F-NEXT: beqz a7, .LBB57_8 +; RV64ZVE32F-NEXT: andi a6, a6, -128 +; RV64ZVE32F-NEXT: beqz a6, .LBB57_8 ; RV64ZVE32F-NEXT: .LBB57_16: # %cond.load19 ; RV64ZVE32F-NEXT: ld a2, 56(a2) ; RV64ZVE32F-NEXT: slli a2, a2, 3 @@ -6984,7 +6984,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV64ZVE32F-NEXT: .LBB57_17: # %else20 ; RV64ZVE32F-NEXT: sd a4, 0(a0) ; RV64ZVE32F-NEXT: sd a5, 8(a0) -; RV64ZVE32F-NEXT: sd a6, 16(a0) +; RV64ZVE32F-NEXT: sd a7, 16(a0) ; RV64ZVE32F-NEXT: sd t0, 24(a0) ; RV64ZVE32F-NEXT: sd t1, 32(a0) ; RV64ZVE32F-NEXT: sd t2, 40(a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll index f09485c2fcbe2..b3011d0f01cab 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll @@ -401,41 +401,54 @@ define void @masked_store_v32i64(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: li a4, 18 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: sub sp, sp, a3 ; RV32-NEXT: addi a3, a2, 128 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a3) ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a4, a3, 3 +; RV32-NEXT: add a3, a4, a3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vle64.v v24, (a2) +; RV32-NEXT: vle64.v v0, (a2) ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vmseq.vv v1, v24, v8 +; RV32-NEXT: vmseq.vv v8, v0, v24 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; RV32-NEXT: addi a2, a0, 128 -; RV32-NEXT: vle64.v v24, (a2) +; RV32-NEXT: vle64.v v8, (a2) ; RV32-NEXT: vle64.v v16, (a0) ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a2, a0, 3 +; RV32-NEXT: add a0, a2, a0 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vmseq.vv v0, v16, v8 +; RV32-NEXT: vmseq.vv v0, v16, v24 ; RV32-NEXT: addi a0, a1, 128 -; RV32-NEXT: vse64.v v24, (a0), v0.t -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vse64.v v8, (a0), v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vse64.v v8, (a1), v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 18 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll index cba78368b2e7d..d9958f4aae350 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll @@ -595,7 +595,15 @@ declare <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v2, v0 +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vslidedown.vi v1, v0, 2 @@ -607,35 +615,43 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z ; CHECK-NEXT: lui a2, %hi(.LCPI26_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t ; CHECK-NEXT: frflags a1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: fsflags a1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfabs.v v16, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmflt.vf v1, v16, fa5, v0.t ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; CHECK-NEXT: vmv.v.v v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %v = call <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl) ret <32 x double> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll index 6982a7cb5cb5e..3e0fb3009c6b1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll @@ -543,54 +543,65 @@ declare <32 x double> @llvm.vp.rint.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v2, v0 +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v1, v0, 2 +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: lui a2, %hi(.LCPI26_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll index 8e5f8cfe0570b..504982111d055 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll @@ -777,7 +777,7 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v1, v0, 2 +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: @@ -798,32 +798,32 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll index 0cbde264378ff..35480164d4a12 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll @@ -777,7 +777,7 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v1, v0, 2 +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: @@ -798,32 +798,32 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll index 24481e4d77a01..4928eba52ac8c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll @@ -777,7 +777,7 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v1, v0, 2 +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: @@ -798,32 +798,32 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll index aae047260163f..e558d45a3b2d7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll @@ -634,7 +634,7 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: vmv1r.v v2, v0 +; CHECK-NEXT: vmv1r.v v1, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 @@ -653,13 +653,13 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> ; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: and a2, a4, a2 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vmseq.vv v1, v16, v8, v0.t +; CHECK-NEXT: vmseq.vv v2, v16, v8, v0.t ; CHECK-NEXT: bltu a3, a1, .LBB51_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a3, 128 ; CHECK-NEXT: .LBB51_2: ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vmv1r.v v0, v1 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 @@ -669,7 +669,7 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> ; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmseq.vv v16, v8, v24, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vmv1r.v v8, v1 +; CHECK-NEXT: vmv1r.v v8, v2 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll index 9212b4047d97b..12d96fbfb88d6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll @@ -8,18 +8,18 @@ define <512 x i8> @vadd_v512i8_zvl128(<512 x i8> %a, <512 x i8> %b) #0 { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a4, 40 +; CHECK-NEXT: li a4, 48 ; CHECK-NEXT: mul a2, a2, a4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a4, 24 -; CHECK-NEXT: mul a2, a2, a4 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 5 +; CHECK-NEXT: li a4, 40 +; CHECK-NEXT: mul a2, a2, a4 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill @@ -29,70 +29,82 @@ define <512 x i8> @vadd_v512i8_zvl128(<512 x i8> %a, <512 x i8> %b) #0 { ; CHECK-NEXT: addi a4, a3, 384 ; CHECK-NEXT: vle8.v v8, (a4) ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: li a5, 24 +; CHECK-NEXT: mul a4, a4, a5 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 ; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; CHECK-NEXT: addi a4, a1, 128 ; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a3, 256 ; CHECK-NEXT: vle8.v v8, (a1) -; CHECK-NEXT: vle8.v v16, (a4) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle8.v v8, (a4) +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vle8.v v24, (a2) ; CHECK-NEXT: vle8.v v0, (a3) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v8, v0, v8 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v16, v16, v8 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: li a2, 24 ; CHECK-NEXT: mul a1, a1, a2 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v8, v8, v24 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vadd.vv v16, v16, v8 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v0, v24, v0 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vadd.vv v24, v8, v24 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 40 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vadd.vv v0, v8, v0 ; CHECK-NEXT: vse8.v v0, (a0) ; CHECK-NEXT: addi a1, a0, 384 ; CHECK-NEXT: vse8.v v16, (a1) ; CHECK-NEXT: addi a1, a0, 256 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vse8.v v16, (a1) +; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: vse8.v v24, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: li a1, 48 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll index 4da164c4fcaa8..2d348deb939ea 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll @@ -154,38 +154,48 @@ define <256 x i8> @select_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i3 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v2, v8 -; CHECK-NEXT: vmv1r.v v1, v0 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vmv1r.v v8, v0 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v24, (a0) ; CHECK-NEXT: addi a0, a1, 128 -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: addi a0, a3, -128 ; CHECK-NEXT: sltu a4, a3, a0 ; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: vle8.v v16, (a1) +; CHECK-NEXT: vle8.v v0, (a1) +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: and a0, a4, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vvm v24, v16, v24, v0 ; CHECK-NEXT: bltu a3, a2, .LBB11_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a3, 128 ; CHECK-NEXT: .LBB11_2: ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: vmv8r.v v16, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll index 57e76a354a138..6c4f523aa8d94 100644 --- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll @@ -737,14 +737,16 @@ define @vp_floor_nxv16f64( %va, @vp_floor_nxv16f64( %va, @ustest_f32i32_mm(<4 x float> %x) { ; CHECK-NOV-LABEL: ustest_f32i32_mm: ; CHECK-NOV: # %bb.0: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a1, fa3, rtz -; CHECK-NOV-NEXT: li a4, -1 -; CHECK-NOV-NEXT: srli a4, a4, 32 +; CHECK-NOV-NEXT: li a3, -1 +; CHECK-NOV-NEXT: srli a3, a3, 32 ; CHECK-NOV-NEXT: fcvt.l.s a2, fa2, rtz -; CHECK-NOV-NEXT: bge a1, a4, .LBB32_6 +; CHECK-NOV-NEXT: bge a1, a3, .LBB32_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a3, fa1, rtz -; CHECK-NOV-NEXT: bge a2, a4, .LBB32_7 +; CHECK-NOV-NEXT: fcvt.l.s a4, fa1, rtz +; CHECK-NOV-NEXT: bge a2, a3, .LBB32_7 ; CHECK-NOV-NEXT: .LBB32_2: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a5, fa0, rtz -; CHECK-NOV-NEXT: bge a3, a4, .LBB32_8 +; CHECK-NOV-NEXT: bge a4, a3, .LBB32_8 ; CHECK-NOV-NEXT: .LBB32_3: # %entry -; CHECK-NOV-NEXT: blt a5, a4, .LBB32_5 +; CHECK-NOV-NEXT: blt a5, a3, .LBB32_5 ; CHECK-NOV-NEXT: .LBB32_4: # %entry -; CHECK-NOV-NEXT: mv a5, a4 +; CHECK-NOV-NEXT: mv a5, a3 ; CHECK-NOV-NEXT: .LBB32_5: # %entry -; CHECK-NOV-NEXT: sgtz a4, a5 -; CHECK-NOV-NEXT: negw a4, a4 -; CHECK-NOV-NEXT: and a4, a4, a5 -; CHECK-NOV-NEXT: sgtz a5, a3 +; CHECK-NOV-NEXT: sgtz a3, a5 +; CHECK-NOV-NEXT: negw a3, a3 +; CHECK-NOV-NEXT: and a3, a3, a5 +; CHECK-NOV-NEXT: sgtz a5, a4 ; CHECK-NOV-NEXT: negw a5, a5 -; CHECK-NOV-NEXT: and a3, a5, a3 +; CHECK-NOV-NEXT: and a4, a5, a4 ; CHECK-NOV-NEXT: sgtz a5, a2 ; CHECK-NOV-NEXT: negw a5, a5 ; CHECK-NOV-NEXT: and a2, a5, a2 @@ -3643,20 +3643,20 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) { ; CHECK-NOV-NEXT: and a1, a5, a1 ; CHECK-NOV-NEXT: sw a1, 12(a0) ; CHECK-NOV-NEXT: sw a2, 8(a0) -; CHECK-NOV-NEXT: sw a3, 4(a0) -; CHECK-NOV-NEXT: sw a4, 0(a0) +; CHECK-NOV-NEXT: sw a4, 4(a0) +; CHECK-NOV-NEXT: sw a3, 0(a0) ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB32_6: # %entry -; CHECK-NOV-NEXT: mv a1, a4 -; CHECK-NOV-NEXT: fcvt.l.s a3, fa1, rtz -; CHECK-NOV-NEXT: blt a2, a4, .LBB32_2 +; CHECK-NOV-NEXT: mv a1, a3 +; CHECK-NOV-NEXT: fcvt.l.s a4, fa1, rtz +; CHECK-NOV-NEXT: blt a2, a3, .LBB32_2 ; CHECK-NOV-NEXT: .LBB32_7: # %entry -; CHECK-NOV-NEXT: mv a2, a4 +; CHECK-NOV-NEXT: mv a2, a3 ; CHECK-NOV-NEXT: fcvt.l.s a5, fa0, rtz -; CHECK-NOV-NEXT: blt a3, a4, .LBB32_3 +; CHECK-NOV-NEXT: blt a4, a3, .LBB32_3 ; CHECK-NOV-NEXT: .LBB32_8: # %entry -; CHECK-NOV-NEXT: mv a3, a4 -; CHECK-NOV-NEXT: bge a5, a4, .LBB32_4 +; CHECK-NOV-NEXT: mv a4, a3 +; CHECK-NOV-NEXT: bge a5, a3, .LBB32_4 ; CHECK-NOV-NEXT: j .LBB32_5 ; ; CHECK-V-LABEL: ustest_f32i32_mm: @@ -4487,27 +4487,27 @@ define <4 x i16> @ustest_f32i16_mm(<4 x float> %x) { ; CHECK-NOV-LABEL: ustest_f32i16_mm: ; CHECK-NOV: # %bb.0: # %entry ; CHECK-NOV-NEXT: fcvt.w.s a1, fa3, rtz -; CHECK-NOV-NEXT: lui a4, 16 -; CHECK-NOV-NEXT: addiw a4, a4, -1 +; CHECK-NOV-NEXT: lui a3, 16 +; CHECK-NOV-NEXT: addiw a3, a3, -1 ; CHECK-NOV-NEXT: fcvt.w.s a2, fa2, rtz -; CHECK-NOV-NEXT: bge a1, a4, .LBB41_6 +; CHECK-NOV-NEXT: bge a1, a3, .LBB41_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.w.s a3, fa1, rtz -; CHECK-NOV-NEXT: bge a2, a4, .LBB41_7 +; CHECK-NOV-NEXT: fcvt.w.s a4, fa1, rtz +; CHECK-NOV-NEXT: bge a2, a3, .LBB41_7 ; CHECK-NOV-NEXT: .LBB41_2: # %entry ; CHECK-NOV-NEXT: fcvt.w.s a5, fa0, rtz -; CHECK-NOV-NEXT: bge a3, a4, .LBB41_8 +; CHECK-NOV-NEXT: bge a4, a3, .LBB41_8 ; CHECK-NOV-NEXT: .LBB41_3: # %entry -; CHECK-NOV-NEXT: blt a5, a4, .LBB41_5 +; CHECK-NOV-NEXT: blt a5, a3, .LBB41_5 ; CHECK-NOV-NEXT: .LBB41_4: # %entry -; CHECK-NOV-NEXT: mv a5, a4 +; CHECK-NOV-NEXT: mv a5, a3 ; CHECK-NOV-NEXT: .LBB41_5: # %entry -; CHECK-NOV-NEXT: sgtz a4, a5 -; CHECK-NOV-NEXT: negw a4, a4 -; CHECK-NOV-NEXT: and a4, a4, a5 -; CHECK-NOV-NEXT: sgtz a5, a3 +; CHECK-NOV-NEXT: sgtz a3, a5 +; CHECK-NOV-NEXT: negw a3, a3 +; CHECK-NOV-NEXT: and a3, a3, a5 +; CHECK-NOV-NEXT: sgtz a5, a4 ; CHECK-NOV-NEXT: negw a5, a5 -; CHECK-NOV-NEXT: and a3, a5, a3 +; CHECK-NOV-NEXT: and a4, a5, a4 ; CHECK-NOV-NEXT: sgtz a5, a2 ; CHECK-NOV-NEXT: negw a5, a5 ; CHECK-NOV-NEXT: and a2, a5, a2 @@ -4516,20 +4516,20 @@ define <4 x i16> @ustest_f32i16_mm(<4 x float> %x) { ; CHECK-NOV-NEXT: and a1, a5, a1 ; CHECK-NOV-NEXT: sh a1, 6(a0) ; CHECK-NOV-NEXT: sh a2, 4(a0) -; CHECK-NOV-NEXT: sh a3, 2(a0) -; CHECK-NOV-NEXT: sh a4, 0(a0) +; CHECK-NOV-NEXT: sh a4, 2(a0) +; CHECK-NOV-NEXT: sh a3, 0(a0) ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB41_6: # %entry -; CHECK-NOV-NEXT: mv a1, a4 -; CHECK-NOV-NEXT: fcvt.w.s a3, fa1, rtz -; CHECK-NOV-NEXT: blt a2, a4, .LBB41_2 +; CHECK-NOV-NEXT: mv a1, a3 +; CHECK-NOV-NEXT: fcvt.w.s a4, fa1, rtz +; CHECK-NOV-NEXT: blt a2, a3, .LBB41_2 ; CHECK-NOV-NEXT: .LBB41_7: # %entry -; CHECK-NOV-NEXT: mv a2, a4 +; CHECK-NOV-NEXT: mv a2, a3 ; CHECK-NOV-NEXT: fcvt.w.s a5, fa0, rtz -; CHECK-NOV-NEXT: blt a3, a4, .LBB41_3 +; CHECK-NOV-NEXT: blt a4, a3, .LBB41_3 ; CHECK-NOV-NEXT: .LBB41_8: # %entry -; CHECK-NOV-NEXT: mv a3, a4 -; CHECK-NOV-NEXT: bge a5, a4, .LBB41_4 +; CHECK-NOV-NEXT: mv a4, a3 +; CHECK-NOV-NEXT: bge a5, a3, .LBB41_4 ; CHECK-NOV-NEXT: j .LBB41_5 ; ; CHECK-V-LABEL: ustest_f32i16_mm: diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll index 0eb69c89f2c44..d79d28d52e73c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll @@ -1037,14 +1037,14 @@ define @fshr_v16i64( %a, @fshl_v16i64( %a, @llvm.vp.nearbyint.nxv16f64( @vp_nearbyint_nxv16f64( %va, %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_nxv16f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v1, v0 +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 3 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v2, v0, a2 +; CHECK-NEXT: vslidedown.vx v25, v0, a2 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 @@ -1091,35 +1099,62 @@ define @vp_nearbyint_nxv16f64( %va, ; CHECK-NEXT: lui a3, %hi(.LCPI32_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3) ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vmv8r.v v8, v16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfabs.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t ; CHECK-NEXT: frflags a2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v16, v0.t ; CHECK-NEXT: fsflags a2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a0, a1, .LBB32_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB32_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v16, fa5, v0.t ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv16f64( %va, %m, i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll index a731b40f0ead3..ebb186b197b41 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll @@ -992,14 +992,16 @@ define @vp_rint_nxv16f64( %va, @vp_rint_nxv16f64( %va, @vp_round_nxv16f64( %va, @vp_round_nxv16f64( %va, @vp_roundeven_nxv16f64( %va, ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: vmv1r.v v1, v0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 3 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v2, v0, a2 +; CHECK-NEXT: vslidedown.vx v25, v0, a2 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 @@ -1099,43 +1101,49 @@ define @vp_roundeven_nxv16f64( %va, ; CHECK-NEXT: lui a3, %hi(.LCPI32_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3) ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v25, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a2, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a0, a1, .LBB32_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB32_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v1, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll index 54b326b0b6018..798c7e05bd47b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll @@ -1084,14 +1084,16 @@ define @vp_roundtozero_nxv16f64( %v ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: vmv1r.v v1, v0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 3 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v2, v0, a2 +; CHECK-NEXT: vslidedown.vx v25, v0, a2 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 @@ -1099,43 +1101,49 @@ define @vp_roundtozero_nxv16f64( %v ; CHECK-NEXT: lui a3, %hi(.LCPI32_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3) ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v25, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a2, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a0, a1, .LBB32_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB32_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v1, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll index 848f65ba3f614..40f5aeb57d176 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll @@ -2241,13 +2241,13 @@ define @fcmp_oeq_vv_nxv32f64( %va, @fcmp_oeq_vv_nxv32f64( %va, @fcmp_oeq_vv_nxv32f64( %va, @v16i16_2(<16 x i16> %a, <16 x i16> %b) { ; RV32-NEXT: addi a0, a0, %lo(.LCPI15_0) ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; RV32-NEXT: vle16.v v16, (a0) -; RV32-NEXT: vmv2r.v v20, v10 +; RV32-NEXT: vle16.v v20, (a0) +; RV32-NEXT: vmv2r.v v16, v10 ; RV32-NEXT: vmv2r.v v12, v8 -; RV32-NEXT: vrgather.vv v8, v12, v16 +; RV32-NEXT: vrgather.vv v8, v12, v20 ; RV32-NEXT: vid.v v12 ; RV32-NEXT: vrsub.vi v12, v12, 15 ; RV32-NEXT: lui a0, 16 @@ -265,7 +265,7 @@ define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) { ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV32-NEXT: vmv.s.x v0, a0 ; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; RV32-NEXT: vrgather.vv v8, v20, v12, v0.t +; RV32-NEXT: vrgather.vv v8, v16, v12, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: v16i16_2: @@ -274,10 +274,10 @@ define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) { ; RV64-NEXT: addi a0, a0, %lo(.LCPI15_0) ; RV64-NEXT: li a1, 32 ; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; RV64-NEXT: vle16.v v16, (a0) -; RV64-NEXT: vmv2r.v v20, v10 +; RV64-NEXT: vle16.v v20, (a0) +; RV64-NEXT: vmv2r.v v16, v10 ; RV64-NEXT: vmv2r.v v12, v8 -; RV64-NEXT: vrgather.vv v8, v12, v16 +; RV64-NEXT: vrgather.vv v8, v12, v20 ; RV64-NEXT: vid.v v12 ; RV64-NEXT: vrsub.vi v12, v12, 15 ; RV64-NEXT: lui a0, 16 @@ -285,7 +285,7 @@ define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) { ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV64-NEXT: vmv.s.x v0, a0 ; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; RV64-NEXT: vrgather.vv v8, v20, v12, v0.t +; RV64-NEXT: vrgather.vv v8, v16, v12, v0.t ; RV64-NEXT: ret %v32i16 = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> ret <32 x i16> %v32i16 @@ -369,18 +369,18 @@ define <16 x i32> @v8i32_2(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: v8i32_2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv2r.v v16, v10 +; CHECK-NEXT: vmv2r.v v12, v8 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrsub.vi v18, v10, 15 +; CHECK-NEXT: vid.v v14 +; CHECK-NEXT: vrsub.vi v18, v14, 15 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v18 +; CHECK-NEXT: vrgatherei16.vv v8, v12, v18 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vrsub.vi v8, v10, 7 +; CHECK-NEXT: vrsub.vi v12, v14, 7 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrgatherei16.vv v8, v16, v12, v0.t ; CHECK-NEXT: ret %v16i32 = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> ret <16 x i32> %v16i32 @@ -700,18 +700,18 @@ define <16 x float> @v8f32_2(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: v8f32_2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv2r.v v16, v10 +; CHECK-NEXT: vmv2r.v v12, v8 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrsub.vi v18, v10, 15 +; CHECK-NEXT: vid.v v14 +; CHECK-NEXT: vrsub.vi v18, v14, 15 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v18 +; CHECK-NEXT: vrgatherei16.vv v8, v12, v18 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vrsub.vi v8, v10, 7 +; CHECK-NEXT: vrsub.vi v12, v14, 7 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrgatherei16.vv v8, v16, v12, v0.t ; CHECK-NEXT: ret %v16f32 = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> ret <16 x float> %v16f32 diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll index c4caf94bec6c6..63a85b1f4dc74 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -1459,42 +1459,42 @@ for.cond.cleanup: ; preds = %vector.body define void @sink_splat_fmul_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fmul_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a3, .LBB26_2 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB26_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB26_5 ; CHECK-NEXT: .LBB26_2: # %vector.ph -; CHECK-NEXT: addiw a1, a3, -1 -; CHECK-NEXT: andi a4, a1, 1024 -; CHECK-NEXT: xori a1, a4, 1024 +; CHECK-NEXT: addiw a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: .LBB26_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfmul.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) ; CHECK-NEXT: sub a6, a6, a3 -; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB26_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB26_7 ; CHECK-NEXT: .LBB26_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB26_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) ; CHECK-NEXT: fmul.s fa5, fa5, fa0 ; CHECK-NEXT: fsw fa5, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB26_6 +; CHECK-NEXT: bnez a1, .LBB26_6 ; CHECK-NEXT: .LBB26_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1549,42 +1549,42 @@ for.body: ; preds = %for.body.preheader, define void @sink_splat_fdiv_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fdiv_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a3, .LBB27_2 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB27_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB27_5 ; CHECK-NEXT: .LBB27_2: # %vector.ph -; CHECK-NEXT: addiw a1, a3, -1 -; CHECK-NEXT: andi a4, a1, 1024 -; CHECK-NEXT: xori a1, a4, 1024 +; CHECK-NEXT: addiw a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: .LBB27_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfdiv.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) ; CHECK-NEXT: sub a6, a6, a3 -; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB27_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB27_7 ; CHECK-NEXT: .LBB27_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB27_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) ; CHECK-NEXT: fdiv.s fa5, fa5, fa0 ; CHECK-NEXT: fsw fa5, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB27_6 +; CHECK-NEXT: bnez a1, .LBB27_6 ; CHECK-NEXT: .LBB27_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1639,42 +1639,42 @@ for.body: ; preds = %for.body.preheader, define void @sink_splat_frdiv_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_frdiv_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a3, .LBB28_2 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB28_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB28_5 ; CHECK-NEXT: .LBB28_2: # %vector.ph -; CHECK-NEXT: addiw a1, a3, -1 -; CHECK-NEXT: andi a4, a1, 1024 -; CHECK-NEXT: xori a1, a4, 1024 +; CHECK-NEXT: addiw a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: .LBB28_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfrdiv.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) ; CHECK-NEXT: sub a6, a6, a3 -; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB28_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB28_7 ; CHECK-NEXT: .LBB28_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB28_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) ; CHECK-NEXT: fdiv.s fa5, fa0, fa5 ; CHECK-NEXT: fsw fa5, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB28_6 +; CHECK-NEXT: bnez a1, .LBB28_6 ; CHECK-NEXT: .LBB28_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1729,42 +1729,42 @@ for.body: ; preds = %for.body.preheader, define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fadd_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a3, .LBB29_2 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB29_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB29_5 ; CHECK-NEXT: .LBB29_2: # %vector.ph -; CHECK-NEXT: addiw a1, a3, -1 -; CHECK-NEXT: andi a4, a1, 1024 -; CHECK-NEXT: xori a1, a4, 1024 +; CHECK-NEXT: addiw a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: .LBB29_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfadd.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) ; CHECK-NEXT: sub a6, a6, a3 -; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB29_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB29_7 ; CHECK-NEXT: .LBB29_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB29_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) ; CHECK-NEXT: fadd.s fa5, fa5, fa0 ; CHECK-NEXT: fsw fa5, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB29_6 +; CHECK-NEXT: bnez a1, .LBB29_6 ; CHECK-NEXT: .LBB29_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1819,42 +1819,42 @@ for.body: ; preds = %for.body.preheader, define void @sink_splat_fsub_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fsub_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a3, .LBB30_2 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB30_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB30_5 ; CHECK-NEXT: .LBB30_2: # %vector.ph -; CHECK-NEXT: addiw a1, a3, -1 -; CHECK-NEXT: andi a4, a1, 1024 -; CHECK-NEXT: xori a1, a4, 1024 +; CHECK-NEXT: addiw a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: .LBB30_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfsub.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) ; CHECK-NEXT: sub a6, a6, a3 -; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB30_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB30_7 ; CHECK-NEXT: .LBB30_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB30_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) ; CHECK-NEXT: fsub.s fa5, fa5, fa0 ; CHECK-NEXT: fsw fa5, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB30_6 +; CHECK-NEXT: bnez a1, .LBB30_6 ; CHECK-NEXT: .LBB30_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1909,42 +1909,42 @@ for.body: ; preds = %for.body.preheader, define void @sink_splat_frsub_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_frsub_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a3, .LBB31_2 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB31_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB31_5 ; CHECK-NEXT: .LBB31_2: # %vector.ph -; CHECK-NEXT: addiw a1, a3, -1 -; CHECK-NEXT: andi a4, a1, 1024 -; CHECK-NEXT: xori a1, a4, 1024 +; CHECK-NEXT: addiw a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: .LBB31_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfrsub.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) ; CHECK-NEXT: sub a6, a6, a3 -; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB31_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB31_7 ; CHECK-NEXT: .LBB31_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB31_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) ; CHECK-NEXT: fsub.s fa5, fa0, fa5 ; CHECK-NEXT: fsw fa5, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB31_6 +; CHECK-NEXT: bnez a1, .LBB31_6 ; CHECK-NEXT: .LBB31_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2075,48 +2075,48 @@ for.cond.cleanup: ; preds = %vector.body define void @sink_splat_fma_scalable(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, float %x) { ; CHECK-LABEL: sink_splat_fma_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: srli a4, a3, 2 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a4, .LBB34_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a3, a2, 2 +; CHECK-NEXT: li a4, 1024 +; CHECK-NEXT: bgeu a4, a3, .LBB34_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a4, 0 ; CHECK-NEXT: j .LBB34_5 ; CHECK-NEXT: .LBB34_2: # %vector.ph -; CHECK-NEXT: addiw a2, a4, -1 -; CHECK-NEXT: andi a5, a2, 1024 -; CHECK-NEXT: xori a2, a5, 1024 +; CHECK-NEXT: addiw a4, a3, -1 +; CHECK-NEXT: andi a5, a4, 1024 +; CHECK-NEXT: xori a4, a5, 1024 ; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a6, a0 ; CHECK-NEXT: mv a7, a1 -; CHECK-NEXT: mv t0, a2 +; CHECK-NEXT: mv t0, a4 ; CHECK-NEXT: .LBB34_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a6) ; CHECK-NEXT: vl1re32.v v9, (a7) ; CHECK-NEXT: vfmacc.vf v9, fa0, v8 ; CHECK-NEXT: vs1r.v v9, (a6) -; CHECK-NEXT: sub t0, t0, a4 -; CHECK-NEXT: add a7, a7, a3 -; CHECK-NEXT: add a6, a6, a3 +; CHECK-NEXT: sub t0, t0, a3 +; CHECK-NEXT: add a7, a7, a2 +; CHECK-NEXT: add a6, a6, a2 ; CHECK-NEXT: bnez t0, .LBB34_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a5, .LBB34_7 ; CHECK-NEXT: .LBB34_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a1, a1, a2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a4, -1024 +; CHECK-NEXT: slli a4, a4, 2 +; CHECK-NEXT: add a1, a1, a4 +; CHECK-NEXT: add a0, a0, a4 ; CHECK-NEXT: .LBB34_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) ; CHECK-NEXT: flw fa4, 0(a1) ; CHECK-NEXT: fmadd.s fa5, fa5, fa0, fa4 ; CHECK-NEXT: fsw fa5, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a1, a1, 4 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB34_6 +; CHECK-NEXT: bnez a2, .LBB34_6 ; CHECK-NEXT: .LBB34_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2175,48 +2175,48 @@ for.body: ; preds = %for.body.preheader, define void @sink_splat_fma_commute_scalable(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, float %x) { ; CHECK-LABEL: sink_splat_fma_commute_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: srli a4, a3, 2 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a4, .LBB35_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a3, a2, 2 +; CHECK-NEXT: li a4, 1024 +; CHECK-NEXT: bgeu a4, a3, .LBB35_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a4, 0 ; CHECK-NEXT: j .LBB35_5 ; CHECK-NEXT: .LBB35_2: # %vector.ph -; CHECK-NEXT: addiw a2, a4, -1 -; CHECK-NEXT: andi a5, a2, 1024 -; CHECK-NEXT: xori a2, a5, 1024 +; CHECK-NEXT: addiw a4, a3, -1 +; CHECK-NEXT: andi a5, a4, 1024 +; CHECK-NEXT: xori a4, a5, 1024 ; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a6, a0 ; CHECK-NEXT: mv a7, a1 -; CHECK-NEXT: mv t0, a2 +; CHECK-NEXT: mv t0, a4 ; CHECK-NEXT: .LBB35_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a6) ; CHECK-NEXT: vl1re32.v v9, (a7) ; CHECK-NEXT: vfmacc.vf v9, fa0, v8 ; CHECK-NEXT: vs1r.v v9, (a6) -; CHECK-NEXT: sub t0, t0, a4 -; CHECK-NEXT: add a7, a7, a3 -; CHECK-NEXT: add a6, a6, a3 +; CHECK-NEXT: sub t0, t0, a3 +; CHECK-NEXT: add a7, a7, a2 +; CHECK-NEXT: add a6, a6, a2 ; CHECK-NEXT: bnez t0, .LBB35_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a5, .LBB35_7 ; CHECK-NEXT: .LBB35_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a1, a1, a2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a4, -1024 +; CHECK-NEXT: slli a4, a4, 2 +; CHECK-NEXT: add a1, a1, a4 +; CHECK-NEXT: add a0, a0, a4 ; CHECK-NEXT: .LBB35_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) ; CHECK-NEXT: flw fa4, 0(a1) ; CHECK-NEXT: fmadd.s fa5, fa0, fa5, fa4 ; CHECK-NEXT: fsw fa5, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a1, a1, 4 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB35_6 +; CHECK-NEXT: bnez a2, .LBB35_6 ; CHECK-NEXT: .LBB35_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll index 44afb12124635..8df8a69e8027f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll @@ -671,19 +671,19 @@ define void @strided_store_nxv16f64( %v, ptr %ptr, i32 sig define void @strided_store_nxv16f64_allones_mask( %v, ptr %ptr, i32 signext %stride, i32 zeroext %evl) { ; CHECK-RV32-LABEL: strided_store_nxv16f64_allones_mask: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: csrr a4, vlenb -; CHECK-RV32-NEXT: mv a3, a2 -; CHECK-RV32-NEXT: bltu a2, a4, .LBB35_2 +; CHECK-RV32-NEXT: csrr a3, vlenb +; CHECK-RV32-NEXT: mv a4, a2 +; CHECK-RV32-NEXT: bltu a2, a3, .LBB35_2 ; CHECK-RV32-NEXT: # %bb.1: -; CHECK-RV32-NEXT: mv a3, a4 +; CHECK-RV32-NEXT: mv a4, a3 ; CHECK-RV32-NEXT: .LBB35_2: -; CHECK-RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; CHECK-RV32-NEXT: vsetvli zero, a4, e64, m8, ta, ma ; CHECK-RV32-NEXT: vsse64.v v8, (a0), a1 -; CHECK-RV32-NEXT: sub a4, a2, a4 -; CHECK-RV32-NEXT: sltu a2, a2, a4 +; CHECK-RV32-NEXT: sub a3, a2, a3 +; CHECK-RV32-NEXT: sltu a2, a2, a3 ; CHECK-RV32-NEXT: addi a2, a2, -1 -; CHECK-RV32-NEXT: and a2, a2, a4 -; CHECK-RV32-NEXT: mul a3, a3, a1 +; CHECK-RV32-NEXT: and a2, a2, a3 +; CHECK-RV32-NEXT: mul a3, a4, a1 ; CHECK-RV32-NEXT: add a0, a0, a3 ; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-RV32-NEXT: vsse64.v v16, (a0), a1 @@ -691,19 +691,19 @@ define void @strided_store_nxv16f64_allones_mask( %v, ptr ; ; CHECK-RV64-LABEL: strided_store_nxv16f64_allones_mask: ; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: csrr a4, vlenb -; CHECK-RV64-NEXT: mv a3, a2 -; CHECK-RV64-NEXT: bltu a2, a4, .LBB35_2 +; CHECK-RV64-NEXT: csrr a3, vlenb +; CHECK-RV64-NEXT: mv a4, a2 +; CHECK-RV64-NEXT: bltu a2, a3, .LBB35_2 ; CHECK-RV64-NEXT: # %bb.1: -; CHECK-RV64-NEXT: mv a3, a4 +; CHECK-RV64-NEXT: mv a4, a3 ; CHECK-RV64-NEXT: .LBB35_2: -; CHECK-RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; CHECK-RV64-NEXT: vsetvli zero, a4, e64, m8, ta, ma ; CHECK-RV64-NEXT: vsse64.v v8, (a0), a1 -; CHECK-RV64-NEXT: sub a4, a2, a4 -; CHECK-RV64-NEXT: sltu a2, a2, a4 +; CHECK-RV64-NEXT: sub a3, a2, a3 +; CHECK-RV64-NEXT: sltu a2, a2, a3 ; CHECK-RV64-NEXT: addi a2, a2, -1 -; CHECK-RV64-NEXT: and a2, a2, a4 -; CHECK-RV64-NEXT: mul a3, a3, a1 +; CHECK-RV64-NEXT: and a2, a2, a3 +; CHECK-RV64-NEXT: mul a3, a4, a1 ; CHECK-RV64-NEXT: add a0, a0, a3 ; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; CHECK-RV64-NEXT: vsse64.v v16, (a0), a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index dde54d36d55a1..6d152cef124b4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -104,74 +104,58 @@ define {, } @vector_deinterleave_load_nxv8i6 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a2, 40 +; CHECK-NEXT: li a2, 24 ; CHECK-NEXT: mul a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, a0, a1 -; CHECK-NEXT: vl8re64.v v24, (a1) +; CHECK-NEXT: vl8re64.v v8, (a1) ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a2, 24 -; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vl8re64.v v0, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vid.v v8 ; CHECK-NEXT: vadd.vv v16, v8, v8 ; CHECK-NEXT: vrgather.vv v8, v0, v16 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vrgather.vv v0, v24, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vadd.vi v24, v16, 1 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vrgather.vv v24, v8, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vadd.vi v8, v16, 1 +; CHECK-NEXT: vrgather.vv v16, v0, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v16, v0, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vrgather.vv v24, v0, v8 ; CHECK-NEXT: vmv4r.v v0, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v12, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmv4r.v v28, v8 ; CHECK-NEXT: vmv4r.v v20, v0 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll index b6cb7f9f5ff10..b8cef5816687b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -8,18 +8,18 @@ define {, } @vector_deinterleave_nxv16i1_nxv ; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v12, v8, 1, v0 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vmerge.vim v14, v8, 1, v0 -; CHECK-NEXT: vnsrl.wi v8, v12, 0 -; CHECK-NEXT: vmsne.vi v0, v8, 0 -; CHECK-NEXT: vnsrl.wi v10, v12, 8 -; CHECK-NEXT: vmsne.vi v8, v10, 0 +; CHECK-NEXT: vmerge.vim v10, v10, 1, v0 +; CHECK-NEXT: vnsrl.wi v12, v8, 0 +; CHECK-NEXT: vmsne.vi v0, v12, 0 +; CHECK-NEXT: vnsrl.wi v12, v8, 8 +; CHECK-NEXT: vmsne.vi v8, v12, 0 ; CHECK-NEXT: ret %retval = call {, } @llvm.experimental.vector.deinterleave2.nxv32i1( %vec) ret {, } %retval @@ -165,70 +165,55 @@ define {, } @vector_deinterleave_nxv8i64_nxv ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv8r.v v24, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vid.v v8 ; CHECK-NEXT: vadd.vv v0, v8, v8 ; CHECK-NEXT: vrgather.vv v8, v24, v0 -; CHECK-NEXT: vrgather.vv v24, v16, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vadd.vi v16, v0, 1 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vrgather.vv v16, v8, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vadd.vi v8, v0, 1 +; CHECK-NEXT: vrgather.vv v0, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v0, v24, v16 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v16, v24, v0 +; CHECK-NEXT: vrgather.vv v16, v24, v8 ; CHECK-NEXT: vmv4r.v v24, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v12, v16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v20, v24 +; CHECK-NEXT: vmv4r.v v20, v8 +; CHECK-NEXT: vmv4r.v v4, v24 +; CHECK-NEXT: vmv8r.v v8, v16 +; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 @@ -371,70 +356,55 @@ define {, } @vector_deinterleave_nxv8f ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv8r.v v24, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vid.v v8 ; CHECK-NEXT: vadd.vv v0, v8, v8 ; CHECK-NEXT: vrgather.vv v8, v24, v0 -; CHECK-NEXT: vrgather.vv v24, v16, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vadd.vi v16, v0, 1 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vrgather.vv v16, v8, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vadd.vi v8, v0, 1 +; CHECK-NEXT: vrgather.vv v0, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v0, v24, v16 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v16, v24, v0 +; CHECK-NEXT: vrgather.vv v16, v24, v8 ; CHECK-NEXT: vmv4r.v v24, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v12, v16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v20, v24 +; CHECK-NEXT: vmv4r.v v20, v8 +; CHECK-NEXT: vmv4r.v v4, v24 +; CHECK-NEXT: vmv8r.v v8, v16 +; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll index fa2b49e862fd5..c18602c98e6b8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll @@ -1206,7 +1206,8 @@ define @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfmadd_vf_nxv32f16( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @llvm.riscv.vmand.nxv1i1.i64(, This Inner Loop Header: Depth=1 -; CHECK-NEXT: slli a6, a4, 2 -; CHECK-NEXT: add a5, a0, a6 +; CHECK-NEXT: slli a3, a4, 2 +; CHECK-NEXT: add a5, a0, a3 ; CHECK-NEXT: vle32.v v8, (a5) ; CHECK-NEXT: vmsle.vi v9, v8, -3 ; CHECK-NEXT: vmsgt.vi v10, v8, 2 ; CHECK-NEXT: vmor.mm v0, v9, v10 -; CHECK-NEXT: add a6, a6, a1 -; CHECK-NEXT: vse32.v v8, (a6), v0.t -; CHECK-NEXT: add a4, a4, a3 -; CHECK-NEXT: vsetvli a3, a2, e32, m1, ta, ma -; CHECK-NEXT: bnez a3, .LBB5_2 +; CHECK-NEXT: add a3, a3, a1 +; CHECK-NEXT: vse32.v v8, (a3), v0.t +; CHECK-NEXT: add a4, a4, a6 +; CHECK-NEXT: vsetvli a6, a2, e32, m1, ta, ma +; CHECK-NEXT: bnez a6, .LBB5_2 ; CHECK-NEXT: .LBB5_3: # %for.cond.cleanup ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll index 458ca3e473ef7..97121c275a294 100644 --- a/llvm/test/CodeGen/RISCV/shifts.ll +++ b/llvm/test/CodeGen/RISCV/shifts.ll @@ -492,23 +492,23 @@ define i128 @shl128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: slli a1, a2, 25 ; RV32I-NEXT: srli a1, a1, 28 ; RV32I-NEXT: addi a3, sp, 16 -; RV32I-NEXT: sub a3, a3, a1 -; RV32I-NEXT: lbu a1, 5(a3) -; RV32I-NEXT: lbu a4, 4(a3) -; RV32I-NEXT: lbu a5, 6(a3) -; RV32I-NEXT: lbu a6, 7(a3) -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: sub a1, a3, a1 +; RV32I-NEXT: lbu a3, 5(a1) +; RV32I-NEXT: lbu a4, 4(a1) +; RV32I-NEXT: lbu a5, 6(a1) +; RV32I-NEXT: lbu a6, 7(a1) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: andi a2, a2, 7 -; RV32I-NEXT: sll a4, a1, a2 -; RV32I-NEXT: lbu a5, 1(a3) -; RV32I-NEXT: lbu a6, 0(a3) -; RV32I-NEXT: lbu a7, 2(a3) -; RV32I-NEXT: lbu t0, 3(a3) +; RV32I-NEXT: sll a4, a3, a2 +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 2(a1) +; RV32I-NEXT: lbu t0, 3(a1) ; RV32I-NEXT: slli a5, a5, 8 ; RV32I-NEXT: or a5, a5, a6 ; RV32I-NEXT: slli a7, a7, 16 @@ -519,10 +519,10 @@ define i128 @shl128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: xori a7, a2, 31 ; RV32I-NEXT: srl a6, a6, a7 ; RV32I-NEXT: or a4, a4, a6 -; RV32I-NEXT: lbu a6, 9(a3) -; RV32I-NEXT: lbu t0, 8(a3) -; RV32I-NEXT: lbu t1, 10(a3) -; RV32I-NEXT: lbu t2, 11(a3) +; RV32I-NEXT: lbu a6, 9(a1) +; RV32I-NEXT: lbu t0, 8(a1) +; RV32I-NEXT: lbu t1, 10(a1) +; RV32I-NEXT: lbu t2, 11(a1) ; RV32I-NEXT: slli a6, a6, 8 ; RV32I-NEXT: or a6, a6, t0 ; RV32I-NEXT: slli t1, t1, 16 @@ -530,28 +530,28 @@ define i128 @shl128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: or t0, t2, t1 ; RV32I-NEXT: or a6, t0, a6 ; RV32I-NEXT: sll t0, a6, a2 -; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: srli a3, a3, 1 ; RV32I-NEXT: not t1, a2 -; RV32I-NEXT: srl a1, a1, t1 -; RV32I-NEXT: or a1, t0, a1 -; RV32I-NEXT: lbu t0, 13(a3) -; RV32I-NEXT: lbu t1, 12(a3) -; RV32I-NEXT: lbu t2, 14(a3) -; RV32I-NEXT: lbu a3, 15(a3) +; RV32I-NEXT: srl a3, a3, t1 +; RV32I-NEXT: or a3, t0, a3 +; RV32I-NEXT: lbu t0, 13(a1) +; RV32I-NEXT: lbu t1, 12(a1) +; RV32I-NEXT: lbu t2, 14(a1) +; RV32I-NEXT: lbu a1, 15(a1) ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: or t0, t0, t1 ; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a3, a3, t2 -; RV32I-NEXT: or a3, a3, t0 -; RV32I-NEXT: sll a3, a3, a2 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t2 +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: sll a1, a1, a2 ; RV32I-NEXT: srli a6, a6, 1 ; RV32I-NEXT: srl a6, a6, a7 -; RV32I-NEXT: or a3, a3, a6 +; RV32I-NEXT: or a1, a1, a6 ; RV32I-NEXT: sll a2, a5, a2 ; RV32I-NEXT: sw a2, 0(a0) -; RV32I-NEXT: sw a3, 12(a0) -; RV32I-NEXT: sw a1, 8(a0) +; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: sw a3, 8(a0) ; RV32I-NEXT: sw a4, 4(a0) ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret @@ -581,24 +581,24 @@ define i128 @shl128(i128 %a, i128 %b) nounwind { define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind { ; RV32I-LABEL: fshr64_minsize: ; RV32I: # %bb.0: -; RV32I-NEXT: andi a5, a2, 32 +; RV32I-NEXT: andi a4, a2, 32 ; RV32I-NEXT: mv a3, a0 -; RV32I-NEXT: beqz a5, .LBB9_2 +; RV32I-NEXT: beqz a4, .LBB9_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a3, a1 ; RV32I-NEXT: .LBB9_2: -; RV32I-NEXT: srl a4, a3, a2 -; RV32I-NEXT: beqz a5, .LBB9_4 +; RV32I-NEXT: srl a5, a3, a2 +; RV32I-NEXT: beqz a4, .LBB9_4 ; RV32I-NEXT: # %bb.3: ; RV32I-NEXT: mv a1, a0 ; RV32I-NEXT: .LBB9_4: ; RV32I-NEXT: slli a0, a1, 1 -; RV32I-NEXT: not a5, a2 -; RV32I-NEXT: sll a0, a0, a5 -; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: not a4, a2 +; RV32I-NEXT: sll a0, a0, a4 +; RV32I-NEXT: or a0, a0, a5 ; RV32I-NEXT: srl a1, a1, a2 ; RV32I-NEXT: slli a3, a3, 1 -; RV32I-NEXT: sll a2, a3, a5 +; RV32I-NEXT: sll a2, a3, a4 ; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: ret ; @@ -617,56 +617,56 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind { ; RV32I-LABEL: fshr128_minsize: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 8(a1) -; RV32I-NEXT: lw t1, 0(a1) +; RV32I-NEXT: lw t2, 0(a1) ; RV32I-NEXT: lw a2, 0(a2) -; RV32I-NEXT: lw t0, 4(a1) +; RV32I-NEXT: lw a7, 4(a1) ; RV32I-NEXT: lw a1, 12(a1) -; RV32I-NEXT: andi t2, a2, 64 -; RV32I-NEXT: mv a7, t0 -; RV32I-NEXT: mv a4, t1 -; RV32I-NEXT: beqz t2, .LBB10_2 +; RV32I-NEXT: andi t1, a2, 64 +; RV32I-NEXT: mv t0, a7 +; RV32I-NEXT: mv a4, t2 +; RV32I-NEXT: beqz t1, .LBB10_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a7, a1 +; RV32I-NEXT: mv t0, a1 ; RV32I-NEXT: mv a4, a3 ; RV32I-NEXT: .LBB10_2: ; RV32I-NEXT: andi a6, a2, 32 ; RV32I-NEXT: mv a5, a4 ; RV32I-NEXT: bnez a6, .LBB10_13 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: bnez t2, .LBB10_14 +; RV32I-NEXT: bnez t1, .LBB10_14 ; RV32I-NEXT: .LBB10_4: ; RV32I-NEXT: beqz a6, .LBB10_6 ; RV32I-NEXT: .LBB10_5: -; RV32I-NEXT: mv a7, a3 +; RV32I-NEXT: mv t0, a3 ; RV32I-NEXT: .LBB10_6: -; RV32I-NEXT: slli t3, a7, 1 -; RV32I-NEXT: not t1, a2 -; RV32I-NEXT: beqz t2, .LBB10_8 +; RV32I-NEXT: slli t3, t0, 1 +; RV32I-NEXT: not t2, a2 +; RV32I-NEXT: beqz t1, .LBB10_8 ; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: mv a1, t0 +; RV32I-NEXT: mv a1, a7 ; RV32I-NEXT: .LBB10_8: -; RV32I-NEXT: srl t2, a5, a2 -; RV32I-NEXT: sll t3, t3, t1 -; RV32I-NEXT: srl t0, a7, a2 +; RV32I-NEXT: srl a7, a5, a2 +; RV32I-NEXT: sll t1, t3, t2 +; RV32I-NEXT: srl t0, t0, a2 ; RV32I-NEXT: beqz a6, .LBB10_10 ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: mv a3, a1 ; RV32I-NEXT: .LBB10_10: -; RV32I-NEXT: or a7, t3, t2 -; RV32I-NEXT: slli t2, a3, 1 -; RV32I-NEXT: sll t2, t2, t1 -; RV32I-NEXT: or t0, t2, t0 +; RV32I-NEXT: or a7, t1, a7 +; RV32I-NEXT: slli t1, a3, 1 +; RV32I-NEXT: sll t1, t1, t2 +; RV32I-NEXT: or t0, t1, t0 ; RV32I-NEXT: srl a3, a3, a2 ; RV32I-NEXT: beqz a6, .LBB10_12 ; RV32I-NEXT: # %bb.11: ; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: .LBB10_12: ; RV32I-NEXT: slli a4, a1, 1 -; RV32I-NEXT: sll a4, a4, t1 +; RV32I-NEXT: sll a4, a4, t2 ; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: srl a1, a1, a2 ; RV32I-NEXT: slli a5, a5, 1 -; RV32I-NEXT: sll a2, a5, t1 +; RV32I-NEXT: sll a2, a5, t2 ; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: sw a3, 8(a0) @@ -674,33 +674,33 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind { ; RV32I-NEXT: sw a7, 0(a0) ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB10_13: -; RV32I-NEXT: mv a5, a7 -; RV32I-NEXT: beqz t2, .LBB10_4 +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: beqz t1, .LBB10_4 ; RV32I-NEXT: .LBB10_14: -; RV32I-NEXT: mv a3, t1 +; RV32I-NEXT: mv a3, t2 ; RV32I-NEXT: bnez a6, .LBB10_5 ; RV32I-NEXT: j .LBB10_6 ; ; RV64I-LABEL: fshr128_minsize: ; RV64I: # %bb.0: -; RV64I-NEXT: andi a5, a2, 64 +; RV64I-NEXT: andi a4, a2, 64 ; RV64I-NEXT: mv a3, a0 -; RV64I-NEXT: beqz a5, .LBB10_2 +; RV64I-NEXT: beqz a4, .LBB10_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a3, a1 ; RV64I-NEXT: .LBB10_2: -; RV64I-NEXT: srl a4, a3, a2 -; RV64I-NEXT: beqz a5, .LBB10_4 +; RV64I-NEXT: srl a5, a3, a2 +; RV64I-NEXT: beqz a4, .LBB10_4 ; RV64I-NEXT: # %bb.3: ; RV64I-NEXT: mv a1, a0 ; RV64I-NEXT: .LBB10_4: ; RV64I-NEXT: slli a0, a1, 1 -; RV64I-NEXT: not a5, a2 -; RV64I-NEXT: sll a0, a0, a5 -; RV64I-NEXT: or a0, a0, a4 +; RV64I-NEXT: not a4, a2 +; RV64I-NEXT: sll a0, a0, a4 +; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: srl a1, a1, a2 ; RV64I-NEXT: slli a3, a3, 1 -; RV64I-NEXT: sll a2, a3, a5 +; RV64I-NEXT: sll a2, a3, a4 ; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: ret %res = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 %b) diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll index 231c066de5437..b5f1efa4b160b 100644 --- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -1085,15 +1085,15 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV32I-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw s0, 24(a1) -; RV32I-NEXT: lw s1, 28(a1) -; RV32I-NEXT: lw s2, 16(a1) -; RV32I-NEXT: lw s3, 20(a1) -; RV32I-NEXT: lw s4, 8(a1) -; RV32I-NEXT: lw s5, 12(a1) +; RV32I-NEXT: lw s1, 24(a1) +; RV32I-NEXT: lw s2, 28(a1) +; RV32I-NEXT: lw s3, 16(a1) +; RV32I-NEXT: lw s4, 20(a1) +; RV32I-NEXT: lw s5, 8(a1) +; RV32I-NEXT: lw s6, 12(a1) ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: mv s6, a0 +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: li a2, 1 ; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: li a3, 0 @@ -1101,33 +1101,33 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV32I-NEXT: mv s7, a0 ; RV32I-NEXT: mv s8, a1 ; RV32I-NEXT: li a2, 654 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s5 +; RV32I-NEXT: mv a0, s5 +; RV32I-NEXT: mv a1, s6 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __moddi3@plt -; RV32I-NEXT: mv s4, a0 -; RV32I-NEXT: mv s5, a1 +; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: mv s6, a1 ; RV32I-NEXT: li a2, 23 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a1, s4 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __moddi3@plt -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s4, a1 ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: addi a2, a0, 1327 -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __moddi3@plt -; RV32I-NEXT: sw a1, 28(s6) -; RV32I-NEXT: sw a0, 24(s6) -; RV32I-NEXT: sw s3, 20(s6) -; RV32I-NEXT: sw s2, 16(s6) -; RV32I-NEXT: sw s5, 12(s6) -; RV32I-NEXT: sw s4, 8(s6) -; RV32I-NEXT: sw s8, 4(s6) -; RV32I-NEXT: sw s7, 0(s6) +; RV32I-NEXT: sw a1, 28(s0) +; RV32I-NEXT: sw a0, 24(s0) +; RV32I-NEXT: sw s4, 20(s0) +; RV32I-NEXT: sw s3, 16(s0) +; RV32I-NEXT: sw s6, 12(s0) +; RV32I-NEXT: sw s5, 8(s0) +; RV32I-NEXT: sw s8, 4(s0) +; RV32I-NEXT: sw s7, 0(s0) ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 36(sp) # 4-byte Folded Reload @@ -1154,15 +1154,15 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32IM-NEXT: lw s0, 24(a1) -; RV32IM-NEXT: lw s1, 28(a1) -; RV32IM-NEXT: lw s2, 16(a1) -; RV32IM-NEXT: lw s3, 20(a1) -; RV32IM-NEXT: lw s4, 8(a1) -; RV32IM-NEXT: lw s5, 12(a1) +; RV32IM-NEXT: lw s1, 24(a1) +; RV32IM-NEXT: lw s2, 28(a1) +; RV32IM-NEXT: lw s3, 16(a1) +; RV32IM-NEXT: lw s4, 20(a1) +; RV32IM-NEXT: lw s5, 8(a1) +; RV32IM-NEXT: lw s6, 12(a1) ; RV32IM-NEXT: lw a3, 0(a1) ; RV32IM-NEXT: lw a1, 4(a1) -; RV32IM-NEXT: mv s6, a0 +; RV32IM-NEXT: mv s0, a0 ; RV32IM-NEXT: li a2, 1 ; RV32IM-NEXT: mv a0, a3 ; RV32IM-NEXT: li a3, 0 @@ -1170,33 +1170,33 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: mv s7, a0 ; RV32IM-NEXT: mv s8, a1 ; RV32IM-NEXT: li a2, 654 -; RV32IM-NEXT: mv a0, s4 -; RV32IM-NEXT: mv a1, s5 +; RV32IM-NEXT: mv a0, s5 +; RV32IM-NEXT: mv a1, s6 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __moddi3@plt -; RV32IM-NEXT: mv s4, a0 -; RV32IM-NEXT: mv s5, a1 +; RV32IM-NEXT: mv s5, a0 +; RV32IM-NEXT: mv s6, a1 ; RV32IM-NEXT: li a2, 23 -; RV32IM-NEXT: mv a0, s2 -; RV32IM-NEXT: mv a1, s3 +; RV32IM-NEXT: mv a0, s3 +; RV32IM-NEXT: mv a1, s4 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __moddi3@plt -; RV32IM-NEXT: mv s2, a0 -; RV32IM-NEXT: mv s3, a1 +; RV32IM-NEXT: mv s3, a0 +; RV32IM-NEXT: mv s4, a1 ; RV32IM-NEXT: lui a0, 1 ; RV32IM-NEXT: addi a2, a0, 1327 -; RV32IM-NEXT: mv a0, s0 -; RV32IM-NEXT: mv a1, s1 +; RV32IM-NEXT: mv a0, s1 +; RV32IM-NEXT: mv a1, s2 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __moddi3@plt -; RV32IM-NEXT: sw a1, 28(s6) -; RV32IM-NEXT: sw a0, 24(s6) -; RV32IM-NEXT: sw s3, 20(s6) -; RV32IM-NEXT: sw s2, 16(s6) -; RV32IM-NEXT: sw s5, 12(s6) -; RV32IM-NEXT: sw s4, 8(s6) -; RV32IM-NEXT: sw s8, 4(s6) -; RV32IM-NEXT: sw s7, 0(s6) +; RV32IM-NEXT: sw a1, 28(s0) +; RV32IM-NEXT: sw a0, 24(s0) +; RV32IM-NEXT: sw s4, 20(s0) +; RV32IM-NEXT: sw s3, 16(s0) +; RV32IM-NEXT: sw s6, 12(s0) +; RV32IM-NEXT: sw s5, 8(s0) +; RV32IM-NEXT: sw s8, 4(s0) +; RV32IM-NEXT: sw s7, 0(s0) ; RV32IM-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s1, 36(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/stack-store-check.ll b/llvm/test/CodeGen/RISCV/stack-store-check.ll index ae78b3e9d6a2e..651df94bab496 100644 --- a/llvm/test/CodeGen/RISCV/stack-store-check.ll +++ b/llvm/test/CodeGen/RISCV/stack-store-check.ll @@ -45,7 +45,7 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: addi a2, sp, 584 ; CHECK-NEXT: sw s6, 584(sp) ; CHECK-NEXT: call __subtf3@plt -; CHECK-NEXT: lw s9, 616(sp) +; CHECK-NEXT: lw s1, 616(sp) ; CHECK-NEXT: lw s2, 620(sp) ; CHECK-NEXT: lw s3, 624(sp) ; CHECK-NEXT: lw s4, 628(sp) @@ -59,14 +59,14 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: addi a0, sp, 568 ; CHECK-NEXT: addi a1, sp, 552 ; CHECK-NEXT: addi a2, sp, 536 -; CHECK-NEXT: sw s9, 552(sp) +; CHECK-NEXT: sw s1, 552(sp) ; CHECK-NEXT: call __subtf3@plt ; CHECK-NEXT: lw a0, 568(sp) ; CHECK-NEXT: sw a0, 40(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a0, 572(sp) -; CHECK-NEXT: sw a0, 32(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw a0, 28(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a0, 576(sp) -; CHECK-NEXT: sw a0, 24(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw a0, 20(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a0, 580(sp) ; CHECK-NEXT: sw a0, 48(sp) # 4-byte Folded Spill ; CHECK-NEXT: sw zero, 500(sp) @@ -81,12 +81,11 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: addi a2, sp, 488 ; CHECK-NEXT: sw s6, 504(sp) ; CHECK-NEXT: call __addtf3@plt -; CHECK-NEXT: lw s11, 520(sp) -; CHECK-NEXT: lw s10, 524(sp) +; CHECK-NEXT: lw s9, 520(sp) +; CHECK-NEXT: lw s11, 524(sp) ; CHECK-NEXT: lw s5, 528(sp) -; CHECK-NEXT: sw s5, 20(sp) # 4-byte Folded Spill -; CHECK-NEXT: lw s1, 532(sp) -; CHECK-NEXT: sw s1, 16(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw s10, 532(sp) +; CHECK-NEXT: sw s10, 16(sp) # 4-byte Folded Spill ; CHECK-NEXT: lui a0, %hi(Y1) ; CHECK-NEXT: lw a1, %lo(Y1)(a0) ; CHECK-NEXT: sw a1, 52(sp) # 4-byte Folded Spill @@ -106,26 +105,27 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: addi a0, sp, 328 ; CHECK-NEXT: addi a1, sp, 312 ; CHECK-NEXT: addi a2, sp, 296 -; CHECK-NEXT: sw s9, 312(sp) +; CHECK-NEXT: sw s1, 312(sp) ; CHECK-NEXT: call __multf3@plt ; CHECK-NEXT: lw a0, 328(sp) ; CHECK-NEXT: sw a0, 44(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a0, 332(sp) ; CHECK-NEXT: sw a0, 36(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a0, 336(sp) -; CHECK-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; CHECK-NEXT: lw s9, 340(sp) +; CHECK-NEXT: sw a0, 32(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw a0, 340(sp) +; CHECK-NEXT: sw a0, 24(sp) # 4-byte Folded Spill ; CHECK-NEXT: sw s0, 468(sp) ; CHECK-NEXT: sw s8, 464(sp) ; CHECK-NEXT: sw s7, 460(sp) ; CHECK-NEXT: sw s6, 456(sp) -; CHECK-NEXT: sw s1, 452(sp) +; CHECK-NEXT: sw s10, 452(sp) ; CHECK-NEXT: sw s5, 448(sp) -; CHECK-NEXT: sw s10, 444(sp) +; CHECK-NEXT: sw s11, 444(sp) ; CHECK-NEXT: addi a0, sp, 472 ; CHECK-NEXT: addi a1, sp, 456 ; CHECK-NEXT: addi a2, sp, 440 -; CHECK-NEXT: sw s11, 440(sp) +; CHECK-NEXT: sw s9, 440(sp) ; CHECK-NEXT: call __addtf3@plt ; CHECK-NEXT: lw a3, 472(sp) ; CHECK-NEXT: lw a0, 476(sp) @@ -152,27 +152,27 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: sw a2, %lo(X+8)(a4) ; CHECK-NEXT: sw a3, %lo(X+4)(a4) ; CHECK-NEXT: sw a0, %lo(X)(a4) -; CHECK-NEXT: lw s4, 4(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s4, 212(sp) -; CHECK-NEXT: lw s3, 8(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s3, 208(sp) -; CHECK-NEXT: lw s2, 12(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s2, 204(sp) +; CHECK-NEXT: lw s8, 4(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s8, 212(sp) +; CHECK-NEXT: lw s4, 8(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s4, 208(sp) +; CHECK-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s3, 204(sp) ; CHECK-NEXT: lw a0, 52(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a0, 200(sp) ; CHECK-NEXT: lw a0, 48(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a0, 228(sp) -; CHECK-NEXT: lw s1, 24(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s1, 224(sp) -; CHECK-NEXT: lw s0, 32(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s0, 220(sp) +; CHECK-NEXT: lw s10, 20(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s10, 224(sp) +; CHECK-NEXT: lw s2, 28(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s2, 220(sp) ; CHECK-NEXT: addi a0, sp, 232 ; CHECK-NEXT: addi a1, sp, 216 ; CHECK-NEXT: addi a2, sp, 200 -; CHECK-NEXT: lw s8, 40(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s8, 216(sp) +; CHECK-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s0, 216(sp) ; CHECK-NEXT: call __multf3@plt -; CHECK-NEXT: lw s5, 232(sp) +; CHECK-NEXT: lw s1, 232(sp) ; CHECK-NEXT: lw a0, 236(sp) ; CHECK-NEXT: sw a0, 0(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw s6, 240(sp) @@ -183,13 +183,12 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: sw zero, 344(sp) ; CHECK-NEXT: lw a0, 16(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a0, 372(sp) -; CHECK-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a0, 368(sp) -; CHECK-NEXT: sw s10, 364(sp) +; CHECK-NEXT: sw s5, 368(sp) +; CHECK-NEXT: sw s11, 364(sp) ; CHECK-NEXT: addi a0, sp, 376 ; CHECK-NEXT: addi a1, sp, 360 ; CHECK-NEXT: addi a2, sp, 344 -; CHECK-NEXT: sw s11, 360(sp) +; CHECK-NEXT: sw s9, 360(sp) ; CHECK-NEXT: call __multf3@plt ; CHECK-NEXT: lw a0, 376(sp) ; CHECK-NEXT: lw a1, 388(sp) @@ -202,11 +201,12 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: sw a0, %lo(S)(a4) ; CHECK-NEXT: lw a0, 48(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a0, 260(sp) -; CHECK-NEXT: sw s1, 256(sp) -; CHECK-NEXT: sw s0, 252(sp) -; CHECK-NEXT: sw s8, 248(sp) -; CHECK-NEXT: sw s9, 276(sp) -; CHECK-NEXT: lw a0, 28(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s10, 256(sp) +; CHECK-NEXT: sw s2, 252(sp) +; CHECK-NEXT: sw s0, 248(sp) +; CHECK-NEXT: lw a0, 24(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 276(sp) +; CHECK-NEXT: lw a0, 32(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a0, 272(sp) ; CHECK-NEXT: lw a0, 36(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a0, 268(sp) @@ -236,7 +236,7 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: addi a0, sp, 184 ; CHECK-NEXT: addi a1, sp, 168 ; CHECK-NEXT: addi a2, sp, 152 -; CHECK-NEXT: sw s5, 168(sp) +; CHECK-NEXT: sw s1, 168(sp) ; CHECK-NEXT: call __addtf3@plt ; CHECK-NEXT: lw a0, 184(sp) ; CHECK-NEXT: lw a1, 196(sp) @@ -251,9 +251,9 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: sw zero, 112(sp) ; CHECK-NEXT: sw zero, 108(sp) ; CHECK-NEXT: sw zero, 104(sp) -; CHECK-NEXT: sw s4, 132(sp) -; CHECK-NEXT: sw s3, 128(sp) -; CHECK-NEXT: sw s2, 124(sp) +; CHECK-NEXT: sw s8, 132(sp) +; CHECK-NEXT: sw s4, 128(sp) +; CHECK-NEXT: sw s3, 124(sp) ; CHECK-NEXT: addi a0, sp, 136 ; CHECK-NEXT: addi a1, sp, 120 ; CHECK-NEXT: addi a2, sp, 104 diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll index d57c62170699a..8c0d97afe6c21 100644 --- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll @@ -15,34 +15,34 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 { ; RISCV32-NEXT: lw a6, 8(a1) ; RISCV32-NEXT: lw a4, 0(a2) ; RISCV32-NEXT: lw a5, 0(a1) -; RISCV32-NEXT: lw t3, 4(a1) +; RISCV32-NEXT: lw t2, 4(a1) ; RISCV32-NEXT: lw t0, 8(a2) ; RISCV32-NEXT: lw a2, 4(a2) ; RISCV32-NEXT: mulhu a1, a5, a4 -; RISCV32-NEXT: mul t1, t3, a4 +; RISCV32-NEXT: mul t1, t2, a4 ; RISCV32-NEXT: add a1, t1, a1 ; RISCV32-NEXT: sltu t1, a1, t1 -; RISCV32-NEXT: mulhu t2, t3, a4 -; RISCV32-NEXT: add t4, t2, t1 +; RISCV32-NEXT: mulhu t3, t2, a4 +; RISCV32-NEXT: add t4, t3, t1 ; RISCV32-NEXT: mul t1, a5, a2 ; RISCV32-NEXT: add a1, t1, a1 ; RISCV32-NEXT: sltu t1, a1, t1 -; RISCV32-NEXT: mulhu t2, a5, a2 -; RISCV32-NEXT: add t1, t2, t1 +; RISCV32-NEXT: mulhu t3, a5, a2 +; RISCV32-NEXT: add t1, t3, t1 ; RISCV32-NEXT: add t5, t4, t1 -; RISCV32-NEXT: mul t6, t3, a2 +; RISCV32-NEXT: mul t6, t2, a2 ; RISCV32-NEXT: add s0, t6, t5 ; RISCV32-NEXT: mul t1, t0, a5 ; RISCV32-NEXT: mul s3, a6, a4 ; RISCV32-NEXT: add s4, s3, t1 ; RISCV32-NEXT: add t1, s0, s4 -; RISCV32-NEXT: sltu t2, t1, s0 +; RISCV32-NEXT: sltu t3, t1, s0 ; RISCV32-NEXT: sltu s0, s0, t6 ; RISCV32-NEXT: sltu t4, t5, t4 -; RISCV32-NEXT: mulhu t5, t3, a2 +; RISCV32-NEXT: mulhu t5, t2, a2 ; RISCV32-NEXT: add t4, t5, t4 ; RISCV32-NEXT: add s0, t4, s0 -; RISCV32-NEXT: mul t4, t3, t0 +; RISCV32-NEXT: mul t4, t2, t0 ; RISCV32-NEXT: mul t5, a7, a5 ; RISCV32-NEXT: add t4, t5, t4 ; RISCV32-NEXT: mulhu s1, t0, a5 @@ -56,22 +56,22 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 { ; RISCV32-NEXT: sltu s3, s4, s3 ; RISCV32-NEXT: add t4, t4, s3 ; RISCV32-NEXT: add t4, s0, t4 -; RISCV32-NEXT: add t4, t4, t2 +; RISCV32-NEXT: add t4, t4, t3 ; RISCV32-NEXT: beq t4, s0, .LBB0_2 ; RISCV32-NEXT: # %bb.1: # %start -; RISCV32-NEXT: sltu t2, t4, s0 +; RISCV32-NEXT: sltu t3, t4, s0 ; RISCV32-NEXT: .LBB0_2: # %start ; RISCV32-NEXT: sltu s0, s2, s1 -; RISCV32-NEXT: snez s1, t3 +; RISCV32-NEXT: snez s1, t2 ; RISCV32-NEXT: snez s2, a7 ; RISCV32-NEXT: and s1, s2, s1 ; RISCV32-NEXT: mulhu s2, a7, a5 ; RISCV32-NEXT: snez s2, s2 ; RISCV32-NEXT: or s1, s1, s2 -; RISCV32-NEXT: mulhu t3, t3, t0 -; RISCV32-NEXT: snez t3, t3 -; RISCV32-NEXT: or t3, s1, t3 -; RISCV32-NEXT: or t3, t3, s0 +; RISCV32-NEXT: mulhu t2, t2, t0 +; RISCV32-NEXT: snez t2, t2 +; RISCV32-NEXT: or t2, s1, t2 +; RISCV32-NEXT: or t2, t2, s0 ; RISCV32-NEXT: sltu t5, t6, t5 ; RISCV32-NEXT: snez t6, a2 ; RISCV32-NEXT: snez s0, a3 @@ -89,7 +89,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 { ; RISCV32-NEXT: snez a3, a3 ; RISCV32-NEXT: and a3, a3, a7 ; RISCV32-NEXT: or a2, a3, a2 -; RISCV32-NEXT: or a3, t3, t2 +; RISCV32-NEXT: or a3, t2, t3 ; RISCV32-NEXT: or a2, a2, a3 ; RISCV32-NEXT: mul a3, a5, a4 ; RISCV32-NEXT: andi a2, a2, 1 diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll index d8f364ec8c00f..a38ae17f19df3 100644 --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -791,15 +791,15 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32I-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw s0, 24(a1) -; RV32I-NEXT: lw s1, 28(a1) -; RV32I-NEXT: lw s2, 16(a1) -; RV32I-NEXT: lw s3, 20(a1) -; RV32I-NEXT: lw s4, 8(a1) -; RV32I-NEXT: lw s5, 12(a1) +; RV32I-NEXT: lw s1, 24(a1) +; RV32I-NEXT: lw s2, 28(a1) +; RV32I-NEXT: lw s3, 16(a1) +; RV32I-NEXT: lw s4, 20(a1) +; RV32I-NEXT: lw s5, 8(a1) +; RV32I-NEXT: lw s6, 12(a1) ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: mv s6, a0 +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: li a2, 1 ; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: li a3, 0 @@ -807,33 +807,33 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32I-NEXT: mv s7, a0 ; RV32I-NEXT: mv s8, a1 ; RV32I-NEXT: li a2, 654 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s5 +; RV32I-NEXT: mv a0, s5 +; RV32I-NEXT: mv a1, s6 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __umoddi3@plt -; RV32I-NEXT: mv s4, a0 -; RV32I-NEXT: mv s5, a1 +; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: mv s6, a1 ; RV32I-NEXT: li a2, 23 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a1, s4 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __umoddi3@plt -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s4, a1 ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: addi a2, a0, 1327 -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __umoddi3@plt -; RV32I-NEXT: sw a1, 28(s6) -; RV32I-NEXT: sw a0, 24(s6) -; RV32I-NEXT: sw s3, 20(s6) -; RV32I-NEXT: sw s2, 16(s6) -; RV32I-NEXT: sw s5, 12(s6) -; RV32I-NEXT: sw s4, 8(s6) -; RV32I-NEXT: sw s8, 4(s6) -; RV32I-NEXT: sw s7, 0(s6) +; RV32I-NEXT: sw a1, 28(s0) +; RV32I-NEXT: sw a0, 24(s0) +; RV32I-NEXT: sw s4, 20(s0) +; RV32I-NEXT: sw s3, 16(s0) +; RV32I-NEXT: sw s6, 12(s0) +; RV32I-NEXT: sw s5, 8(s0) +; RV32I-NEXT: sw s8, 4(s0) +; RV32I-NEXT: sw s7, 0(s0) ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 36(sp) # 4-byte Folded Reload @@ -860,15 +860,15 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32IM-NEXT: lw s0, 24(a1) -; RV32IM-NEXT: lw s1, 28(a1) -; RV32IM-NEXT: lw s2, 16(a1) -; RV32IM-NEXT: lw s3, 20(a1) -; RV32IM-NEXT: lw s4, 8(a1) -; RV32IM-NEXT: lw s5, 12(a1) +; RV32IM-NEXT: lw s1, 24(a1) +; RV32IM-NEXT: lw s2, 28(a1) +; RV32IM-NEXT: lw s3, 16(a1) +; RV32IM-NEXT: lw s4, 20(a1) +; RV32IM-NEXT: lw s5, 8(a1) +; RV32IM-NEXT: lw s6, 12(a1) ; RV32IM-NEXT: lw a3, 0(a1) ; RV32IM-NEXT: lw a1, 4(a1) -; RV32IM-NEXT: mv s6, a0 +; RV32IM-NEXT: mv s0, a0 ; RV32IM-NEXT: li a2, 1 ; RV32IM-NEXT: mv a0, a3 ; RV32IM-NEXT: li a3, 0 @@ -876,33 +876,33 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: mv s7, a0 ; RV32IM-NEXT: mv s8, a1 ; RV32IM-NEXT: li a2, 654 -; RV32IM-NEXT: mv a0, s4 -; RV32IM-NEXT: mv a1, s5 +; RV32IM-NEXT: mv a0, s5 +; RV32IM-NEXT: mv a1, s6 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __umoddi3@plt -; RV32IM-NEXT: mv s4, a0 -; RV32IM-NEXT: mv s5, a1 +; RV32IM-NEXT: mv s5, a0 +; RV32IM-NEXT: mv s6, a1 ; RV32IM-NEXT: li a2, 23 -; RV32IM-NEXT: mv a0, s2 -; RV32IM-NEXT: mv a1, s3 +; RV32IM-NEXT: mv a0, s3 +; RV32IM-NEXT: mv a1, s4 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __umoddi3@plt -; RV32IM-NEXT: mv s2, a0 -; RV32IM-NEXT: mv s3, a1 +; RV32IM-NEXT: mv s3, a0 +; RV32IM-NEXT: mv s4, a1 ; RV32IM-NEXT: lui a0, 1 ; RV32IM-NEXT: addi a2, a0, 1327 -; RV32IM-NEXT: mv a0, s0 -; RV32IM-NEXT: mv a1, s1 +; RV32IM-NEXT: mv a0, s1 +; RV32IM-NEXT: mv a1, s2 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __umoddi3@plt -; RV32IM-NEXT: sw a1, 28(s6) -; RV32IM-NEXT: sw a0, 24(s6) -; RV32IM-NEXT: sw s3, 20(s6) -; RV32IM-NEXT: sw s2, 16(s6) -; RV32IM-NEXT: sw s5, 12(s6) -; RV32IM-NEXT: sw s4, 8(s6) -; RV32IM-NEXT: sw s8, 4(s6) -; RV32IM-NEXT: sw s7, 0(s6) +; RV32IM-NEXT: sw a1, 28(s0) +; RV32IM-NEXT: sw a0, 24(s0) +; RV32IM-NEXT: sw s4, 20(s0) +; RV32IM-NEXT: sw s3, 16(s0) +; RV32IM-NEXT: sw s6, 12(s0) +; RV32IM-NEXT: sw s5, 8(s0) +; RV32IM-NEXT: sw s8, 4(s0) +; RV32IM-NEXT: sw s7, 0(s0) ; RV32IM-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s1, 36(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll index 3550ac8de9648..b0d435368e92b 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -1397,50 +1397,50 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a0, 56(sp) ; RV64I-NEXT: andi a1, a1, 31 ; RV64I-NEXT: addi a0, sp, 56 -; RV64I-NEXT: add a5, a0, a1 -; RV64I-NEXT: lbu a0, 8(a5) +; RV64I-NEXT: add a6, a0, a1 +; RV64I-NEXT: lbu a0, 8(a6) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 9(a5) +; RV64I-NEXT: lbu a0, 9(a6) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 10(a5) +; RV64I-NEXT: lbu a0, 10(a6) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 11(a5) +; RV64I-NEXT: lbu a0, 11(a6) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 12(a5) +; RV64I-NEXT: lbu a0, 12(a6) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a7, 13(a5) -; RV64I-NEXT: lbu t0, 14(a5) -; RV64I-NEXT: lbu t1, 15(a5) -; RV64I-NEXT: lbu t2, 0(a5) -; RV64I-NEXT: lbu t3, 1(a5) -; RV64I-NEXT: lbu t4, 2(a5) -; RV64I-NEXT: lbu t5, 3(a5) -; RV64I-NEXT: lbu t6, 4(a5) -; RV64I-NEXT: lbu s0, 5(a5) -; RV64I-NEXT: lbu s1, 6(a5) -; RV64I-NEXT: lbu s2, 7(a5) -; RV64I-NEXT: lbu s3, 24(a5) -; RV64I-NEXT: lbu s4, 25(a5) -; RV64I-NEXT: lbu s5, 26(a5) -; RV64I-NEXT: lbu s6, 27(a5) -; RV64I-NEXT: lbu s7, 28(a5) -; RV64I-NEXT: lbu s8, 29(a5) -; RV64I-NEXT: lbu s9, 30(a5) -; RV64I-NEXT: lbu s10, 31(a5) -; RV64I-NEXT: lbu s11, 16(a5) -; RV64I-NEXT: lbu ra, 17(a5) -; RV64I-NEXT: lbu a6, 18(a5) -; RV64I-NEXT: lbu a4, 19(a5) -; RV64I-NEXT: lbu a0, 23(a5) -; RV64I-NEXT: lbu a1, 22(a5) -; RV64I-NEXT: lbu a3, 21(a5) -; RV64I-NEXT: lbu a5, 20(a5) +; RV64I-NEXT: lbu a7, 13(a6) +; RV64I-NEXT: lbu t0, 14(a6) +; RV64I-NEXT: lbu t1, 15(a6) +; RV64I-NEXT: lbu t2, 0(a6) +; RV64I-NEXT: lbu t3, 1(a6) +; RV64I-NEXT: lbu t4, 2(a6) +; RV64I-NEXT: lbu t5, 3(a6) +; RV64I-NEXT: lbu t6, 4(a6) +; RV64I-NEXT: lbu s0, 5(a6) +; RV64I-NEXT: lbu s1, 6(a6) +; RV64I-NEXT: lbu s2, 7(a6) +; RV64I-NEXT: lbu s3, 24(a6) +; RV64I-NEXT: lbu s4, 25(a6) +; RV64I-NEXT: lbu s5, 26(a6) +; RV64I-NEXT: lbu s6, 27(a6) +; RV64I-NEXT: lbu s7, 28(a6) +; RV64I-NEXT: lbu s8, 29(a6) +; RV64I-NEXT: lbu s9, 30(a6) +; RV64I-NEXT: lbu s10, 31(a6) +; RV64I-NEXT: lbu s11, 16(a6) +; RV64I-NEXT: lbu ra, 17(a6) +; RV64I-NEXT: lbu a5, 18(a6) +; RV64I-NEXT: lbu a4, 19(a6) +; RV64I-NEXT: lbu a0, 23(a6) +; RV64I-NEXT: lbu a1, 22(a6) +; RV64I-NEXT: lbu a3, 21(a6) +; RV64I-NEXT: lbu a6, 20(a6) ; RV64I-NEXT: sb a0, 23(a2) ; RV64I-NEXT: sb a1, 22(a2) ; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: sb a5, 20(a2) +; RV64I-NEXT: sb a6, 20(a2) ; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: sb a6, 18(a2) +; RV64I-NEXT: sb a5, 18(a2) ; RV64I-NEXT: sb ra, 17(a2) ; RV64I-NEXT: sb s11, 16(a2) ; RV64I-NEXT: sb s10, 31(a2) @@ -1615,50 +1615,50 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 28(sp) ; RV32I-NEXT: andi a1, a1, 31 ; RV32I-NEXT: addi a0, sp, 28 -; RV32I-NEXT: add a5, a0, a1 -; RV32I-NEXT: lbu a0, 6(a5) +; RV32I-NEXT: add a6, a0, a1 +; RV32I-NEXT: lbu a0, 6(a6) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 7(a5) +; RV32I-NEXT: lbu a0, 7(a6) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 4(a5) +; RV32I-NEXT: lbu a0, 4(a6) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 5(a5) +; RV32I-NEXT: lbu a0, 5(a6) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 0(a5) +; RV32I-NEXT: lbu a0, 0(a6) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 1(a5) -; RV32I-NEXT: lbu t0, 2(a5) -; RV32I-NEXT: lbu t1, 3(a5) -; RV32I-NEXT: lbu t2, 14(a5) -; RV32I-NEXT: lbu t3, 15(a5) -; RV32I-NEXT: lbu t4, 12(a5) -; RV32I-NEXT: lbu t5, 13(a5) -; RV32I-NEXT: lbu t6, 10(a5) -; RV32I-NEXT: lbu s0, 11(a5) -; RV32I-NEXT: lbu s1, 8(a5) -; RV32I-NEXT: lbu s2, 9(a5) -; RV32I-NEXT: lbu s3, 22(a5) -; RV32I-NEXT: lbu s4, 23(a5) -; RV32I-NEXT: lbu s5, 20(a5) -; RV32I-NEXT: lbu s6, 21(a5) -; RV32I-NEXT: lbu s7, 18(a5) -; RV32I-NEXT: lbu s8, 19(a5) -; RV32I-NEXT: lbu s9, 16(a5) -; RV32I-NEXT: lbu s10, 17(a5) -; RV32I-NEXT: lbu s11, 30(a5) -; RV32I-NEXT: lbu ra, 31(a5) -; RV32I-NEXT: lbu a6, 28(a5) -; RV32I-NEXT: lbu a4, 29(a5) -; RV32I-NEXT: lbu a0, 25(a5) -; RV32I-NEXT: lbu a1, 24(a5) -; RV32I-NEXT: lbu a3, 27(a5) -; RV32I-NEXT: lbu a5, 26(a5) +; RV32I-NEXT: lbu a7, 1(a6) +; RV32I-NEXT: lbu t0, 2(a6) +; RV32I-NEXT: lbu t1, 3(a6) +; RV32I-NEXT: lbu t2, 14(a6) +; RV32I-NEXT: lbu t3, 15(a6) +; RV32I-NEXT: lbu t4, 12(a6) +; RV32I-NEXT: lbu t5, 13(a6) +; RV32I-NEXT: lbu t6, 10(a6) +; RV32I-NEXT: lbu s0, 11(a6) +; RV32I-NEXT: lbu s1, 8(a6) +; RV32I-NEXT: lbu s2, 9(a6) +; RV32I-NEXT: lbu s3, 22(a6) +; RV32I-NEXT: lbu s4, 23(a6) +; RV32I-NEXT: lbu s5, 20(a6) +; RV32I-NEXT: lbu s6, 21(a6) +; RV32I-NEXT: lbu s7, 18(a6) +; RV32I-NEXT: lbu s8, 19(a6) +; RV32I-NEXT: lbu s9, 16(a6) +; RV32I-NEXT: lbu s10, 17(a6) +; RV32I-NEXT: lbu s11, 30(a6) +; RV32I-NEXT: lbu ra, 31(a6) +; RV32I-NEXT: lbu a5, 28(a6) +; RV32I-NEXT: lbu a4, 29(a6) +; RV32I-NEXT: lbu a0, 25(a6) +; RV32I-NEXT: lbu a1, 24(a6) +; RV32I-NEXT: lbu a3, 27(a6) +; RV32I-NEXT: lbu a6, 26(a6) ; RV32I-NEXT: sb a0, 25(a2) ; RV32I-NEXT: sb a1, 24(a2) ; RV32I-NEXT: sb a3, 27(a2) -; RV32I-NEXT: sb a5, 26(a2) +; RV32I-NEXT: sb a6, 26(a2) ; RV32I-NEXT: sb a4, 29(a2) -; RV32I-NEXT: sb a6, 28(a2) +; RV32I-NEXT: sb a5, 28(a2) ; RV32I-NEXT: sb ra, 31(a2) ; RV32I-NEXT: sb s11, 30(a2) ; RV32I-NEXT: sb s10, 17(a2) @@ -1840,50 +1840,50 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a0, 88(sp) ; RV64I-NEXT: andi a1, a1, 31 ; RV64I-NEXT: addi a0, sp, 88 -; RV64I-NEXT: sub a5, a0, a1 -; RV64I-NEXT: lbu a0, 8(a5) +; RV64I-NEXT: sub a6, a0, a1 +; RV64I-NEXT: lbu a0, 8(a6) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 9(a5) +; RV64I-NEXT: lbu a0, 9(a6) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 10(a5) +; RV64I-NEXT: lbu a0, 10(a6) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 11(a5) +; RV64I-NEXT: lbu a0, 11(a6) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 12(a5) +; RV64I-NEXT: lbu a0, 12(a6) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a7, 13(a5) -; RV64I-NEXT: lbu t0, 14(a5) -; RV64I-NEXT: lbu t1, 15(a5) -; RV64I-NEXT: lbu t2, 0(a5) -; RV64I-NEXT: lbu t3, 1(a5) -; RV64I-NEXT: lbu t4, 2(a5) -; RV64I-NEXT: lbu t5, 3(a5) -; RV64I-NEXT: lbu t6, 4(a5) -; RV64I-NEXT: lbu s0, 5(a5) -; RV64I-NEXT: lbu s1, 6(a5) -; RV64I-NEXT: lbu s2, 7(a5) -; RV64I-NEXT: lbu s3, 24(a5) -; RV64I-NEXT: lbu s4, 25(a5) -; RV64I-NEXT: lbu s5, 26(a5) -; RV64I-NEXT: lbu s6, 27(a5) -; RV64I-NEXT: lbu s7, 28(a5) -; RV64I-NEXT: lbu s8, 29(a5) -; RV64I-NEXT: lbu s9, 30(a5) -; RV64I-NEXT: lbu s10, 31(a5) -; RV64I-NEXT: lbu s11, 16(a5) -; RV64I-NEXT: lbu ra, 17(a5) -; RV64I-NEXT: lbu a6, 18(a5) -; RV64I-NEXT: lbu a4, 19(a5) -; RV64I-NEXT: lbu a0, 23(a5) -; RV64I-NEXT: lbu a1, 22(a5) -; RV64I-NEXT: lbu a3, 21(a5) -; RV64I-NEXT: lbu a5, 20(a5) +; RV64I-NEXT: lbu a7, 13(a6) +; RV64I-NEXT: lbu t0, 14(a6) +; RV64I-NEXT: lbu t1, 15(a6) +; RV64I-NEXT: lbu t2, 0(a6) +; RV64I-NEXT: lbu t3, 1(a6) +; RV64I-NEXT: lbu t4, 2(a6) +; RV64I-NEXT: lbu t5, 3(a6) +; RV64I-NEXT: lbu t6, 4(a6) +; RV64I-NEXT: lbu s0, 5(a6) +; RV64I-NEXT: lbu s1, 6(a6) +; RV64I-NEXT: lbu s2, 7(a6) +; RV64I-NEXT: lbu s3, 24(a6) +; RV64I-NEXT: lbu s4, 25(a6) +; RV64I-NEXT: lbu s5, 26(a6) +; RV64I-NEXT: lbu s6, 27(a6) +; RV64I-NEXT: lbu s7, 28(a6) +; RV64I-NEXT: lbu s8, 29(a6) +; RV64I-NEXT: lbu s9, 30(a6) +; RV64I-NEXT: lbu s10, 31(a6) +; RV64I-NEXT: lbu s11, 16(a6) +; RV64I-NEXT: lbu ra, 17(a6) +; RV64I-NEXT: lbu a5, 18(a6) +; RV64I-NEXT: lbu a4, 19(a6) +; RV64I-NEXT: lbu a0, 23(a6) +; RV64I-NEXT: lbu a1, 22(a6) +; RV64I-NEXT: lbu a3, 21(a6) +; RV64I-NEXT: lbu a6, 20(a6) ; RV64I-NEXT: sb a0, 23(a2) ; RV64I-NEXT: sb a1, 22(a2) ; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: sb a5, 20(a2) +; RV64I-NEXT: sb a6, 20(a2) ; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: sb a6, 18(a2) +; RV64I-NEXT: sb a5, 18(a2) ; RV64I-NEXT: sb ra, 17(a2) ; RV64I-NEXT: sb s11, 16(a2) ; RV64I-NEXT: sb s10, 31(a2) @@ -2058,50 +2058,50 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 60(sp) ; RV32I-NEXT: andi a1, a1, 31 ; RV32I-NEXT: addi a0, sp, 60 -; RV32I-NEXT: sub a5, a0, a1 -; RV32I-NEXT: lbu a0, 6(a5) +; RV32I-NEXT: sub a6, a0, a1 +; RV32I-NEXT: lbu a0, 6(a6) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 7(a5) +; RV32I-NEXT: lbu a0, 7(a6) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 4(a5) +; RV32I-NEXT: lbu a0, 4(a6) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 5(a5) +; RV32I-NEXT: lbu a0, 5(a6) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 0(a5) +; RV32I-NEXT: lbu a0, 0(a6) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 1(a5) -; RV32I-NEXT: lbu t0, 2(a5) -; RV32I-NEXT: lbu t1, 3(a5) -; RV32I-NEXT: lbu t2, 14(a5) -; RV32I-NEXT: lbu t3, 15(a5) -; RV32I-NEXT: lbu t4, 12(a5) -; RV32I-NEXT: lbu t5, 13(a5) -; RV32I-NEXT: lbu t6, 10(a5) -; RV32I-NEXT: lbu s0, 11(a5) -; RV32I-NEXT: lbu s1, 8(a5) -; RV32I-NEXT: lbu s2, 9(a5) -; RV32I-NEXT: lbu s3, 22(a5) -; RV32I-NEXT: lbu s4, 23(a5) -; RV32I-NEXT: lbu s5, 20(a5) -; RV32I-NEXT: lbu s6, 21(a5) -; RV32I-NEXT: lbu s7, 18(a5) -; RV32I-NEXT: lbu s8, 19(a5) -; RV32I-NEXT: lbu s9, 16(a5) -; RV32I-NEXT: lbu s10, 17(a5) -; RV32I-NEXT: lbu s11, 30(a5) -; RV32I-NEXT: lbu ra, 31(a5) -; RV32I-NEXT: lbu a6, 28(a5) -; RV32I-NEXT: lbu a4, 29(a5) -; RV32I-NEXT: lbu a0, 25(a5) -; RV32I-NEXT: lbu a1, 24(a5) -; RV32I-NEXT: lbu a3, 27(a5) -; RV32I-NEXT: lbu a5, 26(a5) +; RV32I-NEXT: lbu a7, 1(a6) +; RV32I-NEXT: lbu t0, 2(a6) +; RV32I-NEXT: lbu t1, 3(a6) +; RV32I-NEXT: lbu t2, 14(a6) +; RV32I-NEXT: lbu t3, 15(a6) +; RV32I-NEXT: lbu t4, 12(a6) +; RV32I-NEXT: lbu t5, 13(a6) +; RV32I-NEXT: lbu t6, 10(a6) +; RV32I-NEXT: lbu s0, 11(a6) +; RV32I-NEXT: lbu s1, 8(a6) +; RV32I-NEXT: lbu s2, 9(a6) +; RV32I-NEXT: lbu s3, 22(a6) +; RV32I-NEXT: lbu s4, 23(a6) +; RV32I-NEXT: lbu s5, 20(a6) +; RV32I-NEXT: lbu s6, 21(a6) +; RV32I-NEXT: lbu s7, 18(a6) +; RV32I-NEXT: lbu s8, 19(a6) +; RV32I-NEXT: lbu s9, 16(a6) +; RV32I-NEXT: lbu s10, 17(a6) +; RV32I-NEXT: lbu s11, 30(a6) +; RV32I-NEXT: lbu ra, 31(a6) +; RV32I-NEXT: lbu a5, 28(a6) +; RV32I-NEXT: lbu a4, 29(a6) +; RV32I-NEXT: lbu a0, 25(a6) +; RV32I-NEXT: lbu a1, 24(a6) +; RV32I-NEXT: lbu a3, 27(a6) +; RV32I-NEXT: lbu a6, 26(a6) ; RV32I-NEXT: sb a0, 25(a2) ; RV32I-NEXT: sb a1, 24(a2) ; RV32I-NEXT: sb a3, 27(a2) -; RV32I-NEXT: sb a5, 26(a2) +; RV32I-NEXT: sb a6, 26(a2) ; RV32I-NEXT: sb a4, 29(a2) -; RV32I-NEXT: sb a6, 28(a2) +; RV32I-NEXT: sb a5, 28(a2) ; RV32I-NEXT: sb ra, 31(a2) ; RV32I-NEXT: sb s11, 30(a2) ; RV32I-NEXT: sb s10, 17(a2) @@ -2172,8 +2172,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv t1, a1 -; RV64I-NEXT: lbu t0, 31(a0) +; RV64I-NEXT: mv t0, a1 +; RV64I-NEXT: lbu t1, 31(a0) ; RV64I-NEXT: lbu a1, 0(a0) ; RV64I-NEXT: sd a1, 48(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a1, 1(a0) @@ -2211,15 +2211,15 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu a1, 30(a0) ; RV64I-NEXT: lbu a3, 29(a0) ; RV64I-NEXT: lbu a0, 28(a0) -; RV64I-NEXT: lbu t1, 0(t1) +; RV64I-NEXT: lbu t0, 0(t0) ; RV64I-NEXT: sb a1, 86(sp) ; RV64I-NEXT: sb a3, 85(sp) ; RV64I-NEXT: sb a0, 84(sp) ; RV64I-NEXT: sb a4, 83(sp) ; RV64I-NEXT: sb a5, 82(sp) ; RV64I-NEXT: sb a6, 81(sp) -; RV64I-NEXT: sb t0, 87(sp) -; RV64I-NEXT: slli t0, t0, 56 +; RV64I-NEXT: sb t1, 87(sp) +; RV64I-NEXT: slli t1, t1, 56 ; RV64I-NEXT: sb a7, 80(sp) ; RV64I-NEXT: sb ra, 79(sp) ; RV64I-NEXT: sb s11, 78(sp) @@ -2251,7 +2251,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a0, 57(sp) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 56(sp) -; RV64I-NEXT: srai a0, t0, 63 +; RV64I-NEXT: srai a0, t1, 63 ; RV64I-NEXT: sb a0, 112(sp) ; RV64I-NEXT: sb a0, 104(sp) ; RV64I-NEXT: sb a0, 96(sp) @@ -2291,52 +2291,52 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a6, 91(sp) ; RV64I-NEXT: sb a7, 90(sp) ; RV64I-NEXT: sb a0, 89(sp) -; RV64I-NEXT: andi a0, t1, 31 +; RV64I-NEXT: andi a0, t0, 31 ; RV64I-NEXT: addi a1, sp, 56 -; RV64I-NEXT: add a5, a1, a0 -; RV64I-NEXT: lbu a0, 8(a5) +; RV64I-NEXT: add a6, a1, a0 +; RV64I-NEXT: lbu a0, 8(a6) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 9(a5) +; RV64I-NEXT: lbu a0, 9(a6) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 10(a5) +; RV64I-NEXT: lbu a0, 10(a6) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 11(a5) +; RV64I-NEXT: lbu a0, 11(a6) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 12(a5) +; RV64I-NEXT: lbu a0, 12(a6) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a7, 13(a5) -; RV64I-NEXT: lbu t0, 14(a5) -; RV64I-NEXT: lbu t1, 15(a5) -; RV64I-NEXT: lbu t2, 0(a5) -; RV64I-NEXT: lbu t3, 1(a5) -; RV64I-NEXT: lbu t4, 2(a5) -; RV64I-NEXT: lbu t5, 3(a5) -; RV64I-NEXT: lbu t6, 4(a5) -; RV64I-NEXT: lbu s0, 5(a5) -; RV64I-NEXT: lbu s1, 6(a5) -; RV64I-NEXT: lbu s2, 7(a5) -; RV64I-NEXT: lbu s3, 24(a5) -; RV64I-NEXT: lbu s4, 25(a5) -; RV64I-NEXT: lbu s5, 26(a5) -; RV64I-NEXT: lbu s6, 27(a5) -; RV64I-NEXT: lbu s7, 28(a5) -; RV64I-NEXT: lbu s8, 29(a5) -; RV64I-NEXT: lbu s9, 30(a5) -; RV64I-NEXT: lbu s10, 31(a5) -; RV64I-NEXT: lbu s11, 16(a5) -; RV64I-NEXT: lbu ra, 17(a5) -; RV64I-NEXT: lbu a6, 18(a5) -; RV64I-NEXT: lbu a4, 19(a5) -; RV64I-NEXT: lbu a0, 23(a5) -; RV64I-NEXT: lbu a1, 22(a5) -; RV64I-NEXT: lbu a3, 21(a5) -; RV64I-NEXT: lbu a5, 20(a5) +; RV64I-NEXT: lbu a7, 13(a6) +; RV64I-NEXT: lbu t0, 14(a6) +; RV64I-NEXT: lbu t1, 15(a6) +; RV64I-NEXT: lbu t2, 0(a6) +; RV64I-NEXT: lbu t3, 1(a6) +; RV64I-NEXT: lbu t4, 2(a6) +; RV64I-NEXT: lbu t5, 3(a6) +; RV64I-NEXT: lbu t6, 4(a6) +; RV64I-NEXT: lbu s0, 5(a6) +; RV64I-NEXT: lbu s1, 6(a6) +; RV64I-NEXT: lbu s2, 7(a6) +; RV64I-NEXT: lbu s3, 24(a6) +; RV64I-NEXT: lbu s4, 25(a6) +; RV64I-NEXT: lbu s5, 26(a6) +; RV64I-NEXT: lbu s6, 27(a6) +; RV64I-NEXT: lbu s7, 28(a6) +; RV64I-NEXT: lbu s8, 29(a6) +; RV64I-NEXT: lbu s9, 30(a6) +; RV64I-NEXT: lbu s10, 31(a6) +; RV64I-NEXT: lbu s11, 16(a6) +; RV64I-NEXT: lbu ra, 17(a6) +; RV64I-NEXT: lbu a5, 18(a6) +; RV64I-NEXT: lbu a4, 19(a6) +; RV64I-NEXT: lbu a0, 23(a6) +; RV64I-NEXT: lbu a1, 22(a6) +; RV64I-NEXT: lbu a3, 21(a6) +; RV64I-NEXT: lbu a6, 20(a6) ; RV64I-NEXT: sb a0, 23(a2) ; RV64I-NEXT: sb a1, 22(a2) ; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: sb a5, 20(a2) +; RV64I-NEXT: sb a6, 20(a2) ; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: sb a6, 18(a2) +; RV64I-NEXT: sb a5, 18(a2) ; RV64I-NEXT: sb ra, 17(a2) ; RV64I-NEXT: sb s11, 16(a2) ; RV64I-NEXT: sb s10, 31(a2) @@ -2400,8 +2400,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv t1, a1 -; RV32I-NEXT: lbu t0, 31(a0) +; RV32I-NEXT: mv t0, a1 +; RV32I-NEXT: lbu t1, 31(a0) ; RV32I-NEXT: lbu a1, 0(a0) ; RV32I-NEXT: sw a1, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu a1, 1(a0) @@ -2439,15 +2439,15 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu a1, 30(a0) ; RV32I-NEXT: lbu a3, 29(a0) ; RV32I-NEXT: lbu a0, 28(a0) -; RV32I-NEXT: lbu t1, 0(t1) +; RV32I-NEXT: lbu t0, 0(t0) ; RV32I-NEXT: sb a1, 58(sp) ; RV32I-NEXT: sb a3, 57(sp) ; RV32I-NEXT: sb a0, 56(sp) ; RV32I-NEXT: sb a4, 55(sp) ; RV32I-NEXT: sb a5, 54(sp) ; RV32I-NEXT: sb a6, 53(sp) -; RV32I-NEXT: sb t0, 59(sp) -; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: sb t1, 59(sp) +; RV32I-NEXT: slli t1, t1, 24 ; RV32I-NEXT: sb a7, 52(sp) ; RV32I-NEXT: sb ra, 51(sp) ; RV32I-NEXT: sb s11, 50(sp) @@ -2479,7 +2479,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 29(sp) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 28(sp) -; RV32I-NEXT: srai a0, t0, 31 +; RV32I-NEXT: srai a0, t1, 31 ; RV32I-NEXT: sb a0, 88(sp) ; RV32I-NEXT: sb a0, 84(sp) ; RV32I-NEXT: sb a0, 80(sp) @@ -2515,52 +2515,52 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a1, 63(sp) ; RV32I-NEXT: sb a3, 62(sp) ; RV32I-NEXT: sb a0, 61(sp) -; RV32I-NEXT: andi a0, t1, 31 +; RV32I-NEXT: andi a0, t0, 31 ; RV32I-NEXT: addi a1, sp, 28 -; RV32I-NEXT: add a5, a1, a0 -; RV32I-NEXT: lbu a0, 6(a5) +; RV32I-NEXT: add a6, a1, a0 +; RV32I-NEXT: lbu a0, 6(a6) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 7(a5) +; RV32I-NEXT: lbu a0, 7(a6) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 4(a5) +; RV32I-NEXT: lbu a0, 4(a6) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 5(a5) +; RV32I-NEXT: lbu a0, 5(a6) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 0(a5) +; RV32I-NEXT: lbu a0, 0(a6) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 1(a5) -; RV32I-NEXT: lbu t0, 2(a5) -; RV32I-NEXT: lbu t1, 3(a5) -; RV32I-NEXT: lbu t2, 14(a5) -; RV32I-NEXT: lbu t3, 15(a5) -; RV32I-NEXT: lbu t4, 12(a5) -; RV32I-NEXT: lbu t5, 13(a5) -; RV32I-NEXT: lbu t6, 10(a5) -; RV32I-NEXT: lbu s0, 11(a5) -; RV32I-NEXT: lbu s1, 8(a5) -; RV32I-NEXT: lbu s2, 9(a5) -; RV32I-NEXT: lbu s3, 22(a5) -; RV32I-NEXT: lbu s4, 23(a5) -; RV32I-NEXT: lbu s5, 20(a5) -; RV32I-NEXT: lbu s6, 21(a5) -; RV32I-NEXT: lbu s7, 18(a5) -; RV32I-NEXT: lbu s8, 19(a5) -; RV32I-NEXT: lbu s9, 16(a5) -; RV32I-NEXT: lbu s10, 17(a5) -; RV32I-NEXT: lbu s11, 30(a5) -; RV32I-NEXT: lbu ra, 31(a5) -; RV32I-NEXT: lbu a6, 28(a5) -; RV32I-NEXT: lbu a4, 29(a5) -; RV32I-NEXT: lbu a0, 25(a5) -; RV32I-NEXT: lbu a1, 24(a5) -; RV32I-NEXT: lbu a3, 27(a5) -; RV32I-NEXT: lbu a5, 26(a5) +; RV32I-NEXT: lbu a7, 1(a6) +; RV32I-NEXT: lbu t0, 2(a6) +; RV32I-NEXT: lbu t1, 3(a6) +; RV32I-NEXT: lbu t2, 14(a6) +; RV32I-NEXT: lbu t3, 15(a6) +; RV32I-NEXT: lbu t4, 12(a6) +; RV32I-NEXT: lbu t5, 13(a6) +; RV32I-NEXT: lbu t6, 10(a6) +; RV32I-NEXT: lbu s0, 11(a6) +; RV32I-NEXT: lbu s1, 8(a6) +; RV32I-NEXT: lbu s2, 9(a6) +; RV32I-NEXT: lbu s3, 22(a6) +; RV32I-NEXT: lbu s4, 23(a6) +; RV32I-NEXT: lbu s5, 20(a6) +; RV32I-NEXT: lbu s6, 21(a6) +; RV32I-NEXT: lbu s7, 18(a6) +; RV32I-NEXT: lbu s8, 19(a6) +; RV32I-NEXT: lbu s9, 16(a6) +; RV32I-NEXT: lbu s10, 17(a6) +; RV32I-NEXT: lbu s11, 30(a6) +; RV32I-NEXT: lbu ra, 31(a6) +; RV32I-NEXT: lbu a5, 28(a6) +; RV32I-NEXT: lbu a4, 29(a6) +; RV32I-NEXT: lbu a0, 25(a6) +; RV32I-NEXT: lbu a1, 24(a6) +; RV32I-NEXT: lbu a3, 27(a6) +; RV32I-NEXT: lbu a6, 26(a6) ; RV32I-NEXT: sb a0, 25(a2) ; RV32I-NEXT: sb a1, 24(a2) ; RV32I-NEXT: sb a3, 27(a2) -; RV32I-NEXT: sb a5, 26(a2) +; RV32I-NEXT: sb a6, 26(a2) ; RV32I-NEXT: sb a4, 29(a2) -; RV32I-NEXT: sb a6, 28(a2) +; RV32I-NEXT: sb a5, 28(a2) ; RV32I-NEXT: sb ra, 31(a2) ; RV32I-NEXT: sb s11, 30(a2) ; RV32I-NEXT: sb s10, 17(a2) diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll index ace54fa6bf03d..a601256bc2afa 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll @@ -781,9 +781,9 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a4, a4, a0 -; RV32I-NEXT: andi a5, a1, 7 -; RV32I-NEXT: srl a0, a4, a5 +; RV32I-NEXT: or a5, a4, a0 +; RV32I-NEXT: andi a4, a1, 7 +; RV32I-NEXT: srl a0, a5, a4 ; RV32I-NEXT: lbu a1, 9(a3) ; RV32I-NEXT: lbu a6, 8(a3) ; RV32I-NEXT: lbu a7, 10(a3) @@ -795,7 +795,7 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: or a6, t0, a7 ; RV32I-NEXT: or a6, a6, a1 ; RV32I-NEXT: slli a1, a6, 1 -; RV32I-NEXT: not a7, a5 +; RV32I-NEXT: not a7, a4 ; RV32I-NEXT: sll a1, a1, a7 ; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: lbu a7, 1(a3) @@ -808,12 +808,12 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: or t0, t2, t1 ; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: srl a7, a7, a5 -; RV32I-NEXT: slli a4, a4, 1 -; RV32I-NEXT: xori t0, a5, 31 -; RV32I-NEXT: sll a4, a4, t0 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: srl a6, a6, a5 +; RV32I-NEXT: srl a7, a7, a4 +; RV32I-NEXT: slli a5, a5, 1 +; RV32I-NEXT: xori t0, a4, 31 +; RV32I-NEXT: sll a5, a5, t0 +; RV32I-NEXT: or a5, a7, a5 +; RV32I-NEXT: srl a6, a6, a4 ; RV32I-NEXT: lbu t1, 13(a3) ; RV32I-NEXT: lbu t2, 12(a3) ; RV32I-NEXT: lbu t3, 14(a3) @@ -827,19 +827,19 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t1, a3, 1 ; RV32I-NEXT: sll t0, t1, t0 ; RV32I-NEXT: or t0, a6, t0 -; RV32I-NEXT: srl a3, a3, a5 +; RV32I-NEXT: srl a3, a3, a4 ; RV32I-NEXT: sb a6, 8(a2) ; RV32I-NEXT: sb a3, 12(a2) ; RV32I-NEXT: sb a7, 0(a2) ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: srli a5, a6, 16 -; RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: srli a5, a6, 8 -; RV32I-NEXT: sb a5, 9(a2) -; RV32I-NEXT: srli a5, a3, 16 -; RV32I-NEXT: sb a5, 14(a2) -; RV32I-NEXT: srli a5, a3, 24 -; RV32I-NEXT: sb a5, 15(a2) +; RV32I-NEXT: srli a4, a6, 16 +; RV32I-NEXT: sb a4, 10(a2) +; RV32I-NEXT: srli a4, a6, 8 +; RV32I-NEXT: sb a4, 9(a2) +; RV32I-NEXT: srli a4, a3, 16 +; RV32I-NEXT: sb a4, 14(a2) +; RV32I-NEXT: srli a4, a3, 24 +; RV32I-NEXT: sb a4, 15(a2) ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 13(a2) ; RV32I-NEXT: srli a3, a7, 16 @@ -852,8 +852,8 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 5(a2) ; RV32I-NEXT: srli a0, t0, 24 ; RV32I-NEXT: sb a0, 11(a2) -; RV32I-NEXT: srli a4, a4, 24 -; RV32I-NEXT: sb a4, 3(a2) +; RV32I-NEXT: srli a5, a5, 24 +; RV32I-NEXT: sb a5, 3(a2) ; RV32I-NEXT: srli a1, a1, 24 ; RV32I-NEXT: sb a1, 7(a2) ; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload @@ -1064,9 +1064,9 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a4, a4, a0 -; RV32I-NEXT: andi a5, a1, 7 -; RV32I-NEXT: sll a0, a4, a5 +; RV32I-NEXT: or a5, a4, a0 +; RV32I-NEXT: andi a4, a1, 7 +; RV32I-NEXT: sll a0, a5, a4 ; RV32I-NEXT: lbu a1, 1(a3) ; RV32I-NEXT: lbu a6, 0(a3) ; RV32I-NEXT: lbu a7, 2(a3) @@ -1078,7 +1078,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: or a6, t0, a7 ; RV32I-NEXT: or a6, a6, a1 ; RV32I-NEXT: srli a1, a6, 1 -; RV32I-NEXT: xori a7, a5, 31 +; RV32I-NEXT: xori a7, a4, 31 ; RV32I-NEXT: srl a1, a1, a7 ; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: lbu t0, 13(a3) @@ -1091,7 +1091,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t3, t3, 24 ; RV32I-NEXT: or t1, t3, t2 ; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: sll t0, t0, a5 +; RV32I-NEXT: sll t0, t0, a4 ; RV32I-NEXT: lbu t1, 9(a3) ; RV32I-NEXT: lbu t2, 8(a3) ; RV32I-NEXT: lbu t3, 10(a3) @@ -1105,13 +1105,13 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli t1, a3, 1 ; RV32I-NEXT: srl a7, t1, a7 ; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: sll a3, a3, a5 -; RV32I-NEXT: srli a4, a4, 1 -; RV32I-NEXT: not t1, a5 -; RV32I-NEXT: srl a4, a4, t1 -; RV32I-NEXT: or a4, a3, a4 -; RV32I-NEXT: sll a5, a6, a5 -; RV32I-NEXT: sb a5, 0(a2) +; RV32I-NEXT: sll a3, a3, a4 +; RV32I-NEXT: srli a5, a5, 1 +; RV32I-NEXT: not t1, a4 +; RV32I-NEXT: srl a5, a5, t1 +; RV32I-NEXT: or a5, a3, a5 +; RV32I-NEXT: sll a4, a6, a4 +; RV32I-NEXT: sb a4, 0(a2) ; RV32I-NEXT: srli a6, a3, 16 ; RV32I-NEXT: sb a6, 10(a2) ; RV32I-NEXT: srli a6, a3, 24 @@ -1124,19 +1124,19 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a3, 15(a2) ; RV32I-NEXT: srli a3, t0, 8 ; RV32I-NEXT: sb a3, 13(a2) -; RV32I-NEXT: srli a3, a5, 16 +; RV32I-NEXT: srli a3, a4, 16 ; RV32I-NEXT: sb a3, 2(a2) -; RV32I-NEXT: srli a3, a5, 24 +; RV32I-NEXT: srli a3, a4, 24 ; RV32I-NEXT: sb a3, 3(a2) -; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 1(a2) +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 1(a2) ; RV32I-NEXT: srli a3, a0, 16 ; RV32I-NEXT: sb a3, 6(a2) ; RV32I-NEXT: srli a3, a0, 24 ; RV32I-NEXT: sb a3, 7(a2) ; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 5(a2) -; RV32I-NEXT: sb a4, 8(a2) +; RV32I-NEXT: sb a5, 8(a2) ; RV32I-NEXT: sb a7, 12(a2) ; RV32I-NEXT: sb a1, 4(a2) ; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload @@ -1353,9 +1353,9 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a4, a4, a0 -; RV32I-NEXT: andi a5, a1, 7 -; RV32I-NEXT: srl a0, a4, a5 +; RV32I-NEXT: or a5, a4, a0 +; RV32I-NEXT: andi a4, a1, 7 +; RV32I-NEXT: srl a0, a5, a4 ; RV32I-NEXT: lbu a1, 9(a3) ; RV32I-NEXT: lbu a6, 8(a3) ; RV32I-NEXT: lbu a7, 10(a3) @@ -1367,7 +1367,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: or a6, t0, a7 ; RV32I-NEXT: or a6, a6, a1 ; RV32I-NEXT: slli a1, a6, 1 -; RV32I-NEXT: not a7, a5 +; RV32I-NEXT: not a7, a4 ; RV32I-NEXT: sll a1, a1, a7 ; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: lbu a7, 1(a3) @@ -1380,12 +1380,12 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: or t0, t2, t1 ; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: srl a7, a7, a5 -; RV32I-NEXT: slli a4, a4, 1 -; RV32I-NEXT: xori t0, a5, 31 -; RV32I-NEXT: sll a4, a4, t0 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: srl a6, a6, a5 +; RV32I-NEXT: srl a7, a7, a4 +; RV32I-NEXT: slli a5, a5, 1 +; RV32I-NEXT: xori t0, a4, 31 +; RV32I-NEXT: sll a5, a5, t0 +; RV32I-NEXT: or a5, a7, a5 +; RV32I-NEXT: srl a6, a6, a4 ; RV32I-NEXT: lbu t1, 13(a3) ; RV32I-NEXT: lbu t2, 12(a3) ; RV32I-NEXT: lbu t3, 14(a3) @@ -1399,19 +1399,19 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t1, a3, 1 ; RV32I-NEXT: sll t0, t1, t0 ; RV32I-NEXT: or t0, a6, t0 -; RV32I-NEXT: sra a3, a3, a5 +; RV32I-NEXT: sra a3, a3, a4 ; RV32I-NEXT: sb a6, 8(a2) ; RV32I-NEXT: sb a3, 12(a2) ; RV32I-NEXT: sb a7, 0(a2) ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: srli a5, a6, 16 -; RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: srli a5, a6, 8 -; RV32I-NEXT: sb a5, 9(a2) -; RV32I-NEXT: srli a5, a3, 16 -; RV32I-NEXT: sb a5, 14(a2) -; RV32I-NEXT: srli a5, a3, 24 -; RV32I-NEXT: sb a5, 15(a2) +; RV32I-NEXT: srli a4, a6, 16 +; RV32I-NEXT: sb a4, 10(a2) +; RV32I-NEXT: srli a4, a6, 8 +; RV32I-NEXT: sb a4, 9(a2) +; RV32I-NEXT: srli a4, a3, 16 +; RV32I-NEXT: sb a4, 14(a2) +; RV32I-NEXT: srli a4, a3, 24 +; RV32I-NEXT: sb a4, 15(a2) ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 13(a2) ; RV32I-NEXT: srli a3, a7, 16 @@ -1424,8 +1424,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 5(a2) ; RV32I-NEXT: srli a0, t0, 24 ; RV32I-NEXT: sb a0, 11(a2) -; RV32I-NEXT: srli a4, a4, 24 -; RV32I-NEXT: sb a4, 3(a2) +; RV32I-NEXT: srli a5, a5, 24 +; RV32I-NEXT: sb a5, 3(a2) ; RV32I-NEXT: srli a1, a1, 24 ; RV32I-NEXT: sb a1, 7(a2) ; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload @@ -1497,13 +1497,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli ra, ra, 24 ; RV64I-NEXT: lbu s10, 5(a1) ; RV64I-NEXT: or s11, ra, s11 -; RV64I-NEXT: or s9, s11, s9 -; RV64I-NEXT: lbu s11, 4(a1) +; RV64I-NEXT: or s11, s11, s9 +; RV64I-NEXT: lbu s9, 4(a1) ; RV64I-NEXT: slli s10, s10, 8 ; RV64I-NEXT: lbu ra, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: or s10, s10, s11 -; RV64I-NEXT: lbu s11, 21(a0) +; RV64I-NEXT: or s10, s10, s9 +; RV64I-NEXT: lbu s9, 21(a0) ; RV64I-NEXT: slli ra, ra, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, ra @@ -1511,8 +1511,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a1, a1, s10 ; RV64I-NEXT: lbu s10, 23(a0) ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or t0, a1, s9 -; RV64I-NEXT: lbu s9, 24(a0) +; RV64I-NEXT: or t0, a1, s11 +; RV64I-NEXT: lbu s11, 24(a0) ; RV64I-NEXT: lbu a7, 25(a0) ; RV64I-NEXT: lbu a6, 26(a0) ; RV64I-NEXT: lbu a5, 27(a0) @@ -1527,10 +1527,10 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a5, 83(sp) ; RV64I-NEXT: sb a6, 82(sp) ; RV64I-NEXT: sb a7, 81(sp) -; RV64I-NEXT: sb s9, 80(sp) +; RV64I-NEXT: sb s11, 80(sp) ; RV64I-NEXT: sb s10, 79(sp) ; RV64I-NEXT: sb ra, 78(sp) -; RV64I-NEXT: sb s11, 77(sp) +; RV64I-NEXT: sb s9, 77(sp) ; RV64I-NEXT: sb s8, 76(sp) ; RV64I-NEXT: sb s7, 75(sp) ; RV64I-NEXT: sb s6, 74(sp) @@ -1688,24 +1688,24 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: xori t0, a1, 63 ; RV64I-NEXT: sll t1, t1, t0 ; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: or a3, a3, a7 -; RV64I-NEXT: slli a7, a3, 1 -; RV64I-NEXT: sll a7, a7, t0 -; RV64I-NEXT: srl a4, a4, a1 -; RV64I-NEXT: srl a6, a6, a1 +; RV64I-NEXT: or a7, a3, a7 +; RV64I-NEXT: slli a3, a7, 1 +; RV64I-NEXT: sll t0, a3, t0 +; RV64I-NEXT: srl a3, a4, a1 +; RV64I-NEXT: srl a4, a6, a1 ; RV64I-NEXT: srl a5, a5, a1 -; RV64I-NEXT: srl a1, a3, a1 -; RV64I-NEXT: srli a3, a5, 48 -; RV64I-NEXT: sb a3, 22(a2) -; RV64I-NEXT: srli a3, a5, 40 -; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: srli a3, a5, 32 -; RV64I-NEXT: sb a3, 20(a2) -; RV64I-NEXT: srli a3, a5, 24 -; RV64I-NEXT: sb a3, 19(a2) -; RV64I-NEXT: srli a3, a5, 16 -; RV64I-NEXT: sb a3, 18(a2) -; RV64I-NEXT: or a3, a5, a7 +; RV64I-NEXT: srl a1, a7, a1 +; RV64I-NEXT: srli a6, a5, 48 +; RV64I-NEXT: sb a6, 22(a2) +; RV64I-NEXT: srli a6, a5, 40 +; RV64I-NEXT: sb a6, 21(a2) +; RV64I-NEXT: srli a6, a5, 32 +; RV64I-NEXT: sb a6, 20(a2) +; RV64I-NEXT: srli a6, a5, 24 +; RV64I-NEXT: sb a6, 19(a2) +; RV64I-NEXT: srli a6, a5, 16 +; RV64I-NEXT: sb a6, 18(a2) +; RV64I-NEXT: or a6, a5, t0 ; RV64I-NEXT: sb a5, 16(a2) ; RV64I-NEXT: srli a5, a5, 8 ; RV64I-NEXT: sb a5, 17(a2) @@ -1724,35 +1724,35 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a1, 24(a2) ; RV64I-NEXT: srli a1, a1, 8 ; RV64I-NEXT: sb a1, 25(a2) -; RV64I-NEXT: srli a1, a6, 48 +; RV64I-NEXT: srli a1, a4, 48 ; RV64I-NEXT: sb a1, 6(a2) -; RV64I-NEXT: srli a1, a6, 40 +; RV64I-NEXT: srli a1, a4, 40 ; RV64I-NEXT: sb a1, 5(a2) -; RV64I-NEXT: srli a1, a6, 32 +; RV64I-NEXT: srli a1, a4, 32 ; RV64I-NEXT: sb a1, 4(a2) -; RV64I-NEXT: srli a1, a6, 24 +; RV64I-NEXT: srli a1, a4, 24 ; RV64I-NEXT: sb a1, 3(a2) -; RV64I-NEXT: srli a1, a6, 16 +; RV64I-NEXT: srli a1, a4, 16 ; RV64I-NEXT: sb a1, 2(a2) -; RV64I-NEXT: or a1, a6, t1 -; RV64I-NEXT: sb a6, 0(a2) -; RV64I-NEXT: srli a5, a6, 8 -; RV64I-NEXT: sb a5, 1(a2) -; RV64I-NEXT: srli a5, a4, 48 -; RV64I-NEXT: sb a5, 14(a2) -; RV64I-NEXT: srli a5, a4, 40 -; RV64I-NEXT: sb a5, 13(a2) -; RV64I-NEXT: srli a5, a4, 32 -; RV64I-NEXT: sb a5, 12(a2) -; RV64I-NEXT: srli a5, a4, 24 -; RV64I-NEXT: sb a5, 11(a2) -; RV64I-NEXT: srli a5, a4, 16 -; RV64I-NEXT: sb a5, 10(a2) -; RV64I-NEXT: or a0, a4, a0 -; RV64I-NEXT: sb a4, 8(a2) +; RV64I-NEXT: or a1, a4, t1 +; RV64I-NEXT: sb a4, 0(a2) ; RV64I-NEXT: srli a4, a4, 8 -; RV64I-NEXT: sb a4, 9(a2) -; RV64I-NEXT: srli a3, a3, 56 +; RV64I-NEXT: sb a4, 1(a2) +; RV64I-NEXT: srli a4, a3, 48 +; RV64I-NEXT: sb a4, 14(a2) +; RV64I-NEXT: srli a4, a3, 40 +; RV64I-NEXT: sb a4, 13(a2) +; RV64I-NEXT: srli a4, a3, 32 +; RV64I-NEXT: sb a4, 12(a2) +; RV64I-NEXT: srli a4, a3, 24 +; RV64I-NEXT: sb a4, 11(a2) +; RV64I-NEXT: srli a4, a3, 16 +; RV64I-NEXT: sb a4, 10(a2) +; RV64I-NEXT: or a0, a3, a0 +; RV64I-NEXT: sb a3, 8(a2) +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a3, 9(a2) +; RV64I-NEXT: srli a3, a6, 56 ; RV64I-NEXT: sb a3, 23(a2) ; RV64I-NEXT: srli a1, a1, 56 ; RV64I-NEXT: sb a1, 7(a2) @@ -1816,21 +1816,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu s5, 17(a0) ; RV32I-NEXT: lbu s6, 18(a0) ; RV32I-NEXT: lbu s7, 19(a0) -; RV32I-NEXT: lbu s8, 1(a1) -; RV32I-NEXT: lbu s9, 20(a0) -; RV32I-NEXT: lbu s10, 21(a0) +; RV32I-NEXT: lbu s10, 1(a1) +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) ; RV32I-NEXT: lbu s11, 0(a1) -; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: slli s10, s10, 8 ; RV32I-NEXT: lbu ra, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: or s8, s8, s11 +; RV32I-NEXT: or s10, s10, s11 ; RV32I-NEXT: lbu s11, 22(a0) ; RV32I-NEXT: slli ra, ra, 16 ; RV32I-NEXT: slli a1, a1, 24 ; RV32I-NEXT: or a1, a1, ra ; RV32I-NEXT: lbu ra, 23(a0) -; RV32I-NEXT: or t0, a1, s8 -; RV32I-NEXT: lbu s8, 24(a0) +; RV32I-NEXT: or t0, a1, s10 +; RV32I-NEXT: lbu s10, 24(a0) ; RV32I-NEXT: lbu a7, 25(a0) ; RV32I-NEXT: lbu a6, 26(a0) ; RV32I-NEXT: lbu a5, 27(a0) @@ -1845,11 +1845,11 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a5, 55(sp) ; RV32I-NEXT: sb a6, 54(sp) ; RV32I-NEXT: sb a7, 53(sp) -; RV32I-NEXT: sb s8, 52(sp) +; RV32I-NEXT: sb s10, 52(sp) ; RV32I-NEXT: sb ra, 51(sp) ; RV32I-NEXT: sb s11, 50(sp) -; RV32I-NEXT: sb s10, 49(sp) -; RV32I-NEXT: sb s9, 48(sp) +; RV32I-NEXT: sb s9, 49(sp) +; RV32I-NEXT: sb s8, 48(sp) ; RV32I-NEXT: sb s7, 47(sp) ; RV32I-NEXT: sb s6, 46(sp) ; RV32I-NEXT: sb s5, 45(sp) @@ -1921,7 +1921,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a3, a3, 16 ; RV32I-NEXT: slli a5, a5, 24 ; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: or t4, a3, a0 +; RV32I-NEXT: or t5, a3, a0 ; RV32I-NEXT: andi a3, t0, 7 ; RV32I-NEXT: lbu a0, 9(a4) ; RV32I-NEXT: lbu a1, 8(a4) @@ -1934,69 +1934,69 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: or a1, a6, a5 ; RV32I-NEXT: or a6, a1, a0 ; RV32I-NEXT: slli a0, a6, 1 -; RV32I-NEXT: not t0, a3 -; RV32I-NEXT: sll a0, a0, t0 +; RV32I-NEXT: not t1, a3 +; RV32I-NEXT: sll a0, a0, t1 ; RV32I-NEXT: lbu a1, 1(a4) ; RV32I-NEXT: lbu a5, 0(a4) ; RV32I-NEXT: lbu a7, 2(a4) -; RV32I-NEXT: lbu t1, 3(a4) +; RV32I-NEXT: lbu t0, 3(a4) ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a5, t1, a7 -; RV32I-NEXT: or t1, a5, a1 -; RV32I-NEXT: slli a1, t4, 1 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or t0, a5, a1 +; RV32I-NEXT: slli a1, t5, 1 ; RV32I-NEXT: xori t2, a3, 31 ; RV32I-NEXT: sll a1, a1, t2 ; RV32I-NEXT: lbu a5, 13(a4) ; RV32I-NEXT: lbu a7, 12(a4) ; RV32I-NEXT: lbu t3, 14(a4) -; RV32I-NEXT: lbu t5, 15(a4) +; RV32I-NEXT: lbu t4, 15(a4) ; RV32I-NEXT: slli a5, a5, 8 ; RV32I-NEXT: or a5, a5, a7 ; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t5, t5, 24 -; RV32I-NEXT: or a7, t5, t3 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: or a7, t4, t3 ; RV32I-NEXT: or t3, a7, a5 ; RV32I-NEXT: lbu a5, 17(a4) ; RV32I-NEXT: lbu a7, 16(a4) -; RV32I-NEXT: lbu t5, 18(a4) +; RV32I-NEXT: lbu t4, 18(a4) ; RV32I-NEXT: lbu t6, 19(a4) ; RV32I-NEXT: slli a5, a5, 8 ; RV32I-NEXT: or a5, a5, a7 -; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t4, t4, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a7, t6, t5 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: slli a7, a5, 1 -; RV32I-NEXT: sll a7, a7, t0 -; RV32I-NEXT: lbu t5, 21(a4) +; RV32I-NEXT: or a7, t6, t4 +; RV32I-NEXT: or t4, a7, a5 +; RV32I-NEXT: slli a5, t4, 1 +; RV32I-NEXT: sll a7, a5, t1 +; RV32I-NEXT: lbu a5, 21(a4) ; RV32I-NEXT: lbu t6, 20(a4) ; RV32I-NEXT: lbu s0, 22(a4) ; RV32I-NEXT: lbu s1, 23(a4) -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: or t5, t5, t6 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, t6 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 ; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or t5, s0, t5 -; RV32I-NEXT: lbu t6, 25(a4) -; RV32I-NEXT: lbu s0, 24(a4) +; RV32I-NEXT: or s0, s0, a5 +; RV32I-NEXT: lbu a5, 25(a4) +; RV32I-NEXT: lbu t6, 24(a4) ; RV32I-NEXT: lbu s1, 26(a4) ; RV32I-NEXT: lbu s2, 27(a4) -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: or t6, t6, s0 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, t6 ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or s0, s2, s1 -; RV32I-NEXT: or t6, s0, t6 -; RV32I-NEXT: lbu s0, 29(a4) +; RV32I-NEXT: or t6, s2, s1 +; RV32I-NEXT: or t6, t6, a5 +; RV32I-NEXT: lbu a5, 29(a4) ; RV32I-NEXT: lbu s1, 28(a4) ; RV32I-NEXT: slli s2, t6, 1 -; RV32I-NEXT: sll t0, s2, t0 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: or s0, s0, s1 +; RV32I-NEXT: sll t1, s2, t1 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, s1 ; RV32I-NEXT: lbu s1, 30(a4) ; RV32I-NEXT: lbu a4, 31(a4) ; RV32I-NEXT: slli s2, t3, 1 @@ -2004,76 +2004,76 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli a4, a4, 24 ; RV32I-NEXT: or a4, a4, s1 -; RV32I-NEXT: slli s1, t5, 1 +; RV32I-NEXT: slli s1, s0, 1 ; RV32I-NEXT: sll s1, s1, t2 -; RV32I-NEXT: or a4, a4, s0 -; RV32I-NEXT: slli s0, a4, 1 -; RV32I-NEXT: sll t2, s0, t2 -; RV32I-NEXT: srl t4, t4, a3 -; RV32I-NEXT: srl t1, t1, a3 -; RV32I-NEXT: srl t3, t3, a3 +; RV32I-NEXT: or s3, a4, a5 +; RV32I-NEXT: slli a4, s3, 1 +; RV32I-NEXT: sll t2, a4, t2 +; RV32I-NEXT: srl a4, t5, a3 +; RV32I-NEXT: srl a5, t0, a3 +; RV32I-NEXT: srl t0, t3, a3 ; RV32I-NEXT: srl a6, a6, a3 -; RV32I-NEXT: srl t5, t5, a3 -; RV32I-NEXT: srl a5, a5, a3 -; RV32I-NEXT: srl t6, t6, a3 -; RV32I-NEXT: srl a3, a4, a3 -; RV32I-NEXT: srli a4, t6, 16 -; RV32I-NEXT: sb a4, 26(a2) -; RV32I-NEXT: or a4, t6, t2 -; RV32I-NEXT: sb t6, 24(a2) -; RV32I-NEXT: srli t2, t6, 8 -; RV32I-NEXT: sb t2, 25(a2) -; RV32I-NEXT: srli t2, a3, 24 -; RV32I-NEXT: sb t2, 31(a2) -; RV32I-NEXT: srli t2, a3, 16 -; RV32I-NEXT: sb t2, 30(a2) +; RV32I-NEXT: srl t3, s0, a3 +; RV32I-NEXT: srl t4, t4, a3 +; RV32I-NEXT: srl t5, t6, a3 +; RV32I-NEXT: srl a3, s3, a3 +; RV32I-NEXT: srli t6, t5, 16 +; RV32I-NEXT: sb t6, 26(a2) +; RV32I-NEXT: or t2, t5, t2 +; RV32I-NEXT: sb t5, 24(a2) +; RV32I-NEXT: srli t5, t5, 8 +; RV32I-NEXT: sb t5, 25(a2) +; RV32I-NEXT: srli t5, a3, 24 +; RV32I-NEXT: sb t5, 31(a2) +; RV32I-NEXT: srli t5, a3, 16 +; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb a3, 28(a2) ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 29(a2) -; RV32I-NEXT: srli a3, a5, 16 +; RV32I-NEXT: srli a3, t4, 16 ; RV32I-NEXT: sb a3, 18(a2) -; RV32I-NEXT: or s1, a5, s1 -; RV32I-NEXT: sb a5, 16(a2) -; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 17(a2) -; RV32I-NEXT: srli a3, t5, 16 -; RV32I-NEXT: sb a3, 22(a2) -; RV32I-NEXT: or a3, t5, t0 -; RV32I-NEXT: sb t5, 20(a2) -; RV32I-NEXT: srli a5, t5, 8 -; RV32I-NEXT: sb a5, 21(a2) -; RV32I-NEXT: srli a5, a6, 16 -; RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: or a5, a6, s2 +; RV32I-NEXT: or a3, t4, s1 +; RV32I-NEXT: sb t4, 16(a2) +; RV32I-NEXT: srli t4, t4, 8 +; RV32I-NEXT: sb t4, 17(a2) +; RV32I-NEXT: srli t4, t3, 16 +; RV32I-NEXT: sb t4, 22(a2) +; RV32I-NEXT: or t1, t3, t1 +; RV32I-NEXT: sb t3, 20(a2) +; RV32I-NEXT: srli t3, t3, 8 +; RV32I-NEXT: sb t3, 21(a2) +; RV32I-NEXT: srli t3, a6, 16 +; RV32I-NEXT: sb t3, 10(a2) +; RV32I-NEXT: or t3, a6, s2 ; RV32I-NEXT: sb a6, 8(a2) ; RV32I-NEXT: srli a6, a6, 8 ; RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: srli a6, t3, 16 +; RV32I-NEXT: srli a6, t0, 16 ; RV32I-NEXT: sb a6, 14(a2) -; RV32I-NEXT: or a6, t3, a7 -; RV32I-NEXT: sb t3, 12(a2) -; RV32I-NEXT: srli a7, t3, 8 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: sb t0, 12(a2) +; RV32I-NEXT: srli a7, t0, 8 ; RV32I-NEXT: sb a7, 13(a2) -; RV32I-NEXT: srli a7, t1, 16 +; RV32I-NEXT: srli a7, a5, 16 ; RV32I-NEXT: sb a7, 2(a2) -; RV32I-NEXT: or a1, t1, a1 -; RV32I-NEXT: sb t1, 0(a2) -; RV32I-NEXT: srli a7, t1, 8 -; RV32I-NEXT: sb a7, 1(a2) -; RV32I-NEXT: srli a7, t4, 16 -; RV32I-NEXT: sb a7, 6(a2) -; RV32I-NEXT: or a0, t4, a0 -; RV32I-NEXT: sb t4, 4(a2) -; RV32I-NEXT: srli a7, t4, 8 -; RV32I-NEXT: sb a7, 5(a2) -; RV32I-NEXT: srli a4, a4, 24 +; RV32I-NEXT: or a1, a5, a1 +; RV32I-NEXT: sb a5, 0(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 1(a2) +; RV32I-NEXT: srli a5, a4, 16 +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: sb a4, 4(a2) +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 5(a2) +; RV32I-NEXT: srli a4, t2, 24 ; RV32I-NEXT: sb a4, 27(a2) -; RV32I-NEXT: srli s1, s1, 24 -; RV32I-NEXT: sb s1, 19(a2) ; RV32I-NEXT: srli a3, a3, 24 +; RV32I-NEXT: sb a3, 19(a2) +; RV32I-NEXT: srli a3, t1, 24 ; RV32I-NEXT: sb a3, 23(a2) -; RV32I-NEXT: srli a5, a5, 24 -; RV32I-NEXT: sb a5, 11(a2) +; RV32I-NEXT: srli a3, t3, 24 +; RV32I-NEXT: sb a3, 11(a2) ; RV32I-NEXT: srli a3, a6, 24 ; RV32I-NEXT: sb a3, 15(a2) ; RV32I-NEXT: srli a1, a1, 24 @@ -2155,13 +2155,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli ra, ra, 24 ; RV64I-NEXT: lbu s10, 5(a1) ; RV64I-NEXT: or s11, ra, s11 -; RV64I-NEXT: or s9, s11, s9 -; RV64I-NEXT: lbu s11, 4(a1) +; RV64I-NEXT: or s11, s11, s9 +; RV64I-NEXT: lbu s9, 4(a1) ; RV64I-NEXT: slli s10, s10, 8 ; RV64I-NEXT: lbu ra, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: or s10, s10, s11 -; RV64I-NEXT: lbu s11, 21(a0) +; RV64I-NEXT: or s10, s10, s9 +; RV64I-NEXT: lbu s9, 21(a0) ; RV64I-NEXT: slli ra, ra, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, ra @@ -2169,8 +2169,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a1, a1, s10 ; RV64I-NEXT: lbu s10, 23(a0) ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or t0, a1, s9 -; RV64I-NEXT: lbu s9, 24(a0) +; RV64I-NEXT: or t0, a1, s11 +; RV64I-NEXT: lbu s11, 24(a0) ; RV64I-NEXT: lbu a7, 25(a0) ; RV64I-NEXT: lbu a6, 26(a0) ; RV64I-NEXT: lbu a5, 27(a0) @@ -2185,10 +2185,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a5, 115(sp) ; RV64I-NEXT: sb a6, 114(sp) ; RV64I-NEXT: sb a7, 113(sp) -; RV64I-NEXT: sb s9, 112(sp) +; RV64I-NEXT: sb s11, 112(sp) ; RV64I-NEXT: sb s10, 111(sp) ; RV64I-NEXT: sb ra, 110(sp) -; RV64I-NEXT: sb s11, 109(sp) +; RV64I-NEXT: sb s9, 109(sp) ; RV64I-NEXT: sb s8, 108(sp) ; RV64I-NEXT: sb s7, 107(sp) ; RV64I-NEXT: sb s6, 106(sp) @@ -2474,21 +2474,21 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu s5, 17(a0) ; RV32I-NEXT: lbu s6, 18(a0) ; RV32I-NEXT: lbu s7, 19(a0) -; RV32I-NEXT: lbu s8, 1(a1) -; RV32I-NEXT: lbu s9, 20(a0) -; RV32I-NEXT: lbu s10, 21(a0) +; RV32I-NEXT: lbu s10, 1(a1) +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) ; RV32I-NEXT: lbu s11, 0(a1) -; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: slli s10, s10, 8 ; RV32I-NEXT: lbu ra, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: or s8, s8, s11 +; RV32I-NEXT: or s10, s10, s11 ; RV32I-NEXT: lbu s11, 22(a0) ; RV32I-NEXT: slli ra, ra, 16 ; RV32I-NEXT: slli a1, a1, 24 ; RV32I-NEXT: or a1, a1, ra ; RV32I-NEXT: lbu ra, 23(a0) -; RV32I-NEXT: or t0, a1, s8 -; RV32I-NEXT: lbu s8, 24(a0) +; RV32I-NEXT: or t0, a1, s10 +; RV32I-NEXT: lbu s10, 24(a0) ; RV32I-NEXT: lbu a7, 25(a0) ; RV32I-NEXT: lbu a6, 26(a0) ; RV32I-NEXT: lbu a5, 27(a0) @@ -2503,11 +2503,11 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a5, 87(sp) ; RV32I-NEXT: sb a6, 86(sp) ; RV32I-NEXT: sb a7, 85(sp) -; RV32I-NEXT: sb s8, 84(sp) +; RV32I-NEXT: sb s10, 84(sp) ; RV32I-NEXT: sb ra, 83(sp) ; RV32I-NEXT: sb s11, 82(sp) -; RV32I-NEXT: sb s10, 81(sp) -; RV32I-NEXT: sb s9, 80(sp) +; RV32I-NEXT: sb s9, 81(sp) +; RV32I-NEXT: sb s8, 80(sp) ; RV32I-NEXT: sb s7, 79(sp) ; RV32I-NEXT: sb s6, 78(sp) ; RV32I-NEXT: sb s5, 77(sp) @@ -2568,125 +2568,125 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 60(sp) ; RV32I-NEXT: slli a0, t0, 24 ; RV32I-NEXT: srli a0, a0, 27 -; RV32I-NEXT: addi a5, sp, 60 -; RV32I-NEXT: sub a5, a5, a0 -; RV32I-NEXT: lbu a0, 5(a5) -; RV32I-NEXT: lbu a1, 4(a5) -; RV32I-NEXT: lbu a3, 6(a5) -; RV32I-NEXT: lbu a4, 7(a5) +; RV32I-NEXT: addi a4, sp, 60 +; RV32I-NEXT: sub a4, a4, a0 +; RV32I-NEXT: lbu a0, 5(a4) +; RV32I-NEXT: lbu a1, 4(a4) +; RV32I-NEXT: lbu a3, 6(a4) +; RV32I-NEXT: lbu a5, 7(a4) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: slli a3, a3, 16 -; RV32I-NEXT: slli a4, a4, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or t3, a3, a0 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or t5, a3, a0 ; RV32I-NEXT: andi a1, t0, 7 -; RV32I-NEXT: lbu a0, 1(a5) -; RV32I-NEXT: lbu a3, 0(a5) -; RV32I-NEXT: lbu a4, 2(a5) -; RV32I-NEXT: lbu a6, 3(a5) +; RV32I-NEXT: lbu a0, 1(a4) +; RV32I-NEXT: lbu a3, 0(a4) +; RV32I-NEXT: lbu a5, 2(a4) +; RV32I-NEXT: lbu a6, 3(a4) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a3, a6, a4 +; RV32I-NEXT: or a3, a6, a5 ; RV32I-NEXT: or a6, a3, a0 ; RV32I-NEXT: srli a0, a6, 1 ; RV32I-NEXT: xori a7, a1, 31 ; RV32I-NEXT: srl a0, a0, a7 -; RV32I-NEXT: lbu a3, 13(a5) -; RV32I-NEXT: lbu a4, 12(a5) -; RV32I-NEXT: lbu t0, 14(a5) -; RV32I-NEXT: lbu t1, 15(a5) +; RV32I-NEXT: lbu a3, 13(a4) +; RV32I-NEXT: lbu a5, 12(a4) +; RV32I-NEXT: lbu t0, 14(a4) +; RV32I-NEXT: lbu t1, 15(a4) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: or a3, a3, a5 ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a4, t1, t0 -; RV32I-NEXT: or t0, a4, a3 -; RV32I-NEXT: lbu a3, 9(a5) -; RV32I-NEXT: lbu a4, 8(a5) -; RV32I-NEXT: lbu t1, 10(a5) -; RV32I-NEXT: lbu t2, 11(a5) +; RV32I-NEXT: or a5, t1, t0 +; RV32I-NEXT: or t0, a5, a3 +; RV32I-NEXT: lbu a3, 9(a4) +; RV32I-NEXT: lbu a5, 8(a4) +; RV32I-NEXT: lbu t1, 10(a4) +; RV32I-NEXT: lbu t2, 11(a4) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: or a3, a3, a5 ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a4, t2, t1 -; RV32I-NEXT: or t1, a4, a3 +; RV32I-NEXT: or a5, t2, t1 +; RV32I-NEXT: or t1, a5, a3 ; RV32I-NEXT: srli a3, t1, 1 -; RV32I-NEXT: srl a3, a3, a7 -; RV32I-NEXT: srli a4, t3, 1 +; RV32I-NEXT: srl a5, a3, a7 +; RV32I-NEXT: srli t4, t5, 1 ; RV32I-NEXT: not t2, a1 -; RV32I-NEXT: lbu t4, 21(a5) -; RV32I-NEXT: lbu t5, 20(a5) -; RV32I-NEXT: lbu t6, 22(a5) -; RV32I-NEXT: lbu s0, 23(a5) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t4, t4, t5 +; RV32I-NEXT: lbu a3, 21(a4) +; RV32I-NEXT: lbu t3, 20(a4) +; RV32I-NEXT: lbu t6, 22(a4) +; RV32I-NEXT: lbu s0, 23(a4) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, t3 ; RV32I-NEXT: slli t6, t6, 16 ; RV32I-NEXT: slli s0, s0, 24 -; RV32I-NEXT: or t5, s0, t6 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: lbu t5, 17(a5) -; RV32I-NEXT: lbu t6, 16(a5) -; RV32I-NEXT: lbu s0, 18(a5) -; RV32I-NEXT: lbu s1, 19(a5) -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: or t5, t5, t6 +; RV32I-NEXT: or t3, s0, t6 +; RV32I-NEXT: or t3, t3, a3 +; RV32I-NEXT: lbu a3, 17(a4) +; RV32I-NEXT: lbu t6, 16(a4) +; RV32I-NEXT: lbu s0, 18(a4) +; RV32I-NEXT: lbu s1, 19(a4) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, t6 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 ; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or t5, s0, t5 -; RV32I-NEXT: lbu t6, 29(a5) -; RV32I-NEXT: lbu s0, 28(a5) -; RV32I-NEXT: lbu s1, 30(a5) -; RV32I-NEXT: lbu s2, 31(a5) -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: or t6, t6, s0 +; RV32I-NEXT: or s0, s0, a3 +; RV32I-NEXT: lbu a3, 29(a4) +; RV32I-NEXT: lbu t6, 28(a4) +; RV32I-NEXT: lbu s1, 30(a4) +; RV32I-NEXT: lbu s2, 31(a4) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, t6 ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or s0, s2, s1 -; RV32I-NEXT: lbu s1, 25(a5) -; RV32I-NEXT: lbu s2, 24(a5) -; RV32I-NEXT: srl a4, a4, t2 -; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: or t6, s2, s1 +; RV32I-NEXT: lbu s1, 25(a4) +; RV32I-NEXT: lbu s2, 24(a4) +; RV32I-NEXT: srl t4, t4, t2 +; RV32I-NEXT: or t6, t6, a3 ; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: or s0, s1, s2 -; RV32I-NEXT: lbu s1, 26(a5) -; RV32I-NEXT: lbu a5, 27(a5) -; RV32I-NEXT: srli s2, t5, 1 +; RV32I-NEXT: or a3, s1, s2 +; RV32I-NEXT: lbu s1, 26(a4) +; RV32I-NEXT: lbu a4, 27(a4) +; RV32I-NEXT: srli s2, s0, 1 ; RV32I-NEXT: srl s2, s2, a7 ; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a5, a5, s1 +; RV32I-NEXT: slli a4, a4, 24 +; RV32I-NEXT: or a4, a4, s1 ; RV32I-NEXT: srli s1, t0, 1 ; RV32I-NEXT: srl s1, s1, t2 -; RV32I-NEXT: or a5, a5, s0 -; RV32I-NEXT: srli s0, a5, 1 -; RV32I-NEXT: srl a7, s0, a7 -; RV32I-NEXT: srli s0, t4, 1 -; RV32I-NEXT: srl t2, s0, t2 -; RV32I-NEXT: sll t3, t3, a1 +; RV32I-NEXT: or a4, a4, a3 +; RV32I-NEXT: srli a3, a4, 1 +; RV32I-NEXT: srl a7, a3, a7 +; RV32I-NEXT: srli a3, t3, 1 +; RV32I-NEXT: srl t2, a3, t2 +; RV32I-NEXT: sll a3, t5, a1 ; RV32I-NEXT: sll t0, t0, a1 ; RV32I-NEXT: sll t1, t1, a1 -; RV32I-NEXT: sll t4, t4, a1 -; RV32I-NEXT: sll t5, t5, a1 +; RV32I-NEXT: sll t3, t3, a1 +; RV32I-NEXT: sll t5, s0, a1 ; RV32I-NEXT: sll t6, t6, a1 -; RV32I-NEXT: sll a5, a5, a1 +; RV32I-NEXT: sll a4, a4, a1 ; RV32I-NEXT: sll a1, a6, a1 -; RV32I-NEXT: srli a6, a5, 24 +; RV32I-NEXT: srli a6, a4, 24 ; RV32I-NEXT: sb a6, 27(a2) -; RV32I-NEXT: srli a6, a5, 16 +; RV32I-NEXT: srli a6, a4, 16 ; RV32I-NEXT: sb a6, 26(a2) -; RV32I-NEXT: or a6, a5, t2 -; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 25(a2) -; RV32I-NEXT: srli a5, t6, 24 -; RV32I-NEXT: sb a5, 31(a2) -; RV32I-NEXT: srli a5, t6, 16 -; RV32I-NEXT: sb a5, 30(a2) -; RV32I-NEXT: or a5, t6, a7 +; RV32I-NEXT: or a6, a4, t2 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 25(a2) +; RV32I-NEXT: srli a4, t6, 24 +; RV32I-NEXT: sb a4, 31(a2) +; RV32I-NEXT: srli a4, t6, 16 +; RV32I-NEXT: sb a4, 30(a2) +; RV32I-NEXT: or a4, t6, a7 ; RV32I-NEXT: srli a7, t6, 8 ; RV32I-NEXT: sb a7, 29(a2) ; RV32I-NEXT: srli a7, t5, 24 @@ -2696,25 +2696,25 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: or a7, t5, s1 ; RV32I-NEXT: srli t2, t5, 8 ; RV32I-NEXT: sb t2, 17(a2) -; RV32I-NEXT: srli t2, t4, 24 +; RV32I-NEXT: srli t2, t3, 24 ; RV32I-NEXT: sb t2, 23(a2) -; RV32I-NEXT: srli t2, t4, 16 +; RV32I-NEXT: srli t2, t3, 16 ; RV32I-NEXT: sb t2, 22(a2) -; RV32I-NEXT: or t2, t4, s2 -; RV32I-NEXT: srli t4, t4, 8 -; RV32I-NEXT: sb t4, 21(a2) -; RV32I-NEXT: srli t4, t1, 24 -; RV32I-NEXT: sb t4, 11(a2) -; RV32I-NEXT: srli t4, t1, 16 -; RV32I-NEXT: sb t4, 10(a2) -; RV32I-NEXT: or a4, t1, a4 +; RV32I-NEXT: or t2, t3, s2 +; RV32I-NEXT: srli t3, t3, 8 +; RV32I-NEXT: sb t3, 21(a2) +; RV32I-NEXT: srli t3, t1, 24 +; RV32I-NEXT: sb t3, 11(a2) +; RV32I-NEXT: srli t3, t1, 16 +; RV32I-NEXT: sb t3, 10(a2) +; RV32I-NEXT: or t3, t1, t4 ; RV32I-NEXT: srli t1, t1, 8 ; RV32I-NEXT: sb t1, 9(a2) ; RV32I-NEXT: srli t1, t0, 24 ; RV32I-NEXT: sb t1, 15(a2) ; RV32I-NEXT: srli t1, t0, 16 ; RV32I-NEXT: sb t1, 14(a2) -; RV32I-NEXT: or a3, t0, a3 +; RV32I-NEXT: or a5, t0, a5 ; RV32I-NEXT: srli t0, t0, 8 ; RV32I-NEXT: sb t0, 13(a2) ; RV32I-NEXT: srli t0, a1, 24 @@ -2724,19 +2724,19 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a1, 0(a2) ; RV32I-NEXT: srli a1, a1, 8 ; RV32I-NEXT: sb a1, 1(a2) -; RV32I-NEXT: srli a1, t3, 24 +; RV32I-NEXT: srli a1, a3, 24 ; RV32I-NEXT: sb a1, 7(a2) -; RV32I-NEXT: srli a1, t3, 16 +; RV32I-NEXT: srli a1, a3, 16 ; RV32I-NEXT: sb a1, 6(a2) -; RV32I-NEXT: or a0, t3, a0 -; RV32I-NEXT: srli a1, t3, 8 -; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 5(a2) ; RV32I-NEXT: sb a6, 24(a2) -; RV32I-NEXT: sb a5, 28(a2) +; RV32I-NEXT: sb a4, 28(a2) ; RV32I-NEXT: sb a7, 16(a2) ; RV32I-NEXT: sb t2, 20(a2) -; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb a3, 12(a2) +; RV32I-NEXT: sb t3, 8(a2) +; RV32I-NEXT: sb a5, 12(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload @@ -2776,7 +2776,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu t0, 31(a0) +; RV64I-NEXT: lbu t1, 31(a0) ; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a3, 1(a0) @@ -2789,31 +2789,31 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a3, 5(a0) ; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu t2, 6(a0) -; RV64I-NEXT: lbu t3, 7(a0) -; RV64I-NEXT: lbu t4, 8(a0) -; RV64I-NEXT: lbu t5, 9(a0) -; RV64I-NEXT: lbu t6, 10(a0) -; RV64I-NEXT: lbu s0, 11(a0) -; RV64I-NEXT: lbu s1, 12(a0) -; RV64I-NEXT: lbu s2, 13(a0) -; RV64I-NEXT: lbu s3, 14(a0) -; RV64I-NEXT: lbu s4, 15(a0) -; RV64I-NEXT: lbu s5, 16(a0) -; RV64I-NEXT: lbu s6, 17(a0) -; RV64I-NEXT: lbu s7, 18(a0) -; RV64I-NEXT: lbu s8, 19(a0) -; RV64I-NEXT: lbu s9, 1(a1) +; RV64I-NEXT: lbu t3, 6(a0) +; RV64I-NEXT: lbu t4, 7(a0) +; RV64I-NEXT: lbu t5, 8(a0) +; RV64I-NEXT: lbu t6, 9(a0) +; RV64I-NEXT: lbu s0, 10(a0) +; RV64I-NEXT: lbu s1, 11(a0) +; RV64I-NEXT: lbu s2, 12(a0) +; RV64I-NEXT: lbu s3, 13(a0) +; RV64I-NEXT: lbu s4, 14(a0) +; RV64I-NEXT: lbu s5, 15(a0) +; RV64I-NEXT: lbu s6, 16(a0) +; RV64I-NEXT: lbu s7, 17(a0) +; RV64I-NEXT: lbu s8, 18(a0) +; RV64I-NEXT: lbu s9, 19(a0) +; RV64I-NEXT: lbu a3, 1(a1) ; RV64I-NEXT: lbu s10, 0(a1) ; RV64I-NEXT: lbu s11, 2(a1) ; RV64I-NEXT: lbu ra, 3(a1) -; RV64I-NEXT: slli s9, s9, 8 -; RV64I-NEXT: or s9, s9, s10 +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, s10 ; RV64I-NEXT: slli s11, s11, 16 ; RV64I-NEXT: slli ra, ra, 24 ; RV64I-NEXT: lbu s10, 5(a1) ; RV64I-NEXT: or s11, ra, s11 -; RV64I-NEXT: or s9, s11, s9 +; RV64I-NEXT: or a3, s11, a3 ; RV64I-NEXT: lbu s11, 4(a1) ; RV64I-NEXT: slli s10, s10, 8 ; RV64I-NEXT: lbu ra, 6(a1) @@ -2827,8 +2827,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a1, a1, s10 ; RV64I-NEXT: lbu s10, 22(a0) ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or t1, a1, s9 -; RV64I-NEXT: lbu s9, 23(a0) +; RV64I-NEXT: or t2, a1, a3 +; RV64I-NEXT: lbu t0, 23(a0) ; RV64I-NEXT: lbu a7, 24(a0) ; RV64I-NEXT: lbu a6, 25(a0) ; RV64I-NEXT: lbu a5, 26(a0) @@ -2843,26 +2843,26 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a5, 82(sp) ; RV64I-NEXT: sb a6, 81(sp) ; RV64I-NEXT: sb a7, 80(sp) -; RV64I-NEXT: sb s9, 79(sp) +; RV64I-NEXT: sb t0, 79(sp) ; RV64I-NEXT: sb s10, 78(sp) ; RV64I-NEXT: sb ra, 77(sp) ; RV64I-NEXT: sb s11, 76(sp) -; RV64I-NEXT: sb s8, 75(sp) -; RV64I-NEXT: sb s7, 74(sp) -; RV64I-NEXT: sb s6, 73(sp) -; RV64I-NEXT: sb s5, 72(sp) -; RV64I-NEXT: sb s4, 71(sp) -; RV64I-NEXT: sb s3, 70(sp) -; RV64I-NEXT: sb s2, 69(sp) -; RV64I-NEXT: sb s1, 68(sp) -; RV64I-NEXT: sb s0, 67(sp) -; RV64I-NEXT: sb t6, 66(sp) -; RV64I-NEXT: sb t5, 65(sp) -; RV64I-NEXT: sb t4, 64(sp) -; RV64I-NEXT: sb t0, 87(sp) -; RV64I-NEXT: slli t0, t0, 56 -; RV64I-NEXT: sb t3, 63(sp) -; RV64I-NEXT: sb t2, 62(sp) +; RV64I-NEXT: sb s9, 75(sp) +; RV64I-NEXT: sb s8, 74(sp) +; RV64I-NEXT: sb s7, 73(sp) +; RV64I-NEXT: sb s6, 72(sp) +; RV64I-NEXT: sb s5, 71(sp) +; RV64I-NEXT: sb s4, 70(sp) +; RV64I-NEXT: sb s3, 69(sp) +; RV64I-NEXT: sb s2, 68(sp) +; RV64I-NEXT: sb s1, 67(sp) +; RV64I-NEXT: sb s0, 66(sp) +; RV64I-NEXT: sb t6, 65(sp) +; RV64I-NEXT: sb t5, 64(sp) +; RV64I-NEXT: sb t1, 87(sp) +; RV64I-NEXT: slli t1, t1, 56 +; RV64I-NEXT: sb t4, 63(sp) +; RV64I-NEXT: sb t3, 62(sp) ; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 61(sp) ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload @@ -2875,7 +2875,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a0, 57(sp) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 56(sp) -; RV64I-NEXT: srai a0, t0, 63 +; RV64I-NEXT: srai a0, t1, 63 ; RV64I-NEXT: sb a0, 112(sp) ; RV64I-NEXT: sb a0, 104(sp) ; RV64I-NEXT: sb a0, 96(sp) @@ -2915,47 +2915,47 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a6, 91(sp) ; RV64I-NEXT: sb a7, 90(sp) ; RV64I-NEXT: sb a0, 89(sp) -; RV64I-NEXT: slli a0, t1, 56 +; RV64I-NEXT: slli a0, t2, 56 ; RV64I-NEXT: srli a0, a0, 59 -; RV64I-NEXT: addi a3, sp, 56 -; RV64I-NEXT: add a3, a3, a0 -; RV64I-NEXT: lbu a0, 9(a3) -; RV64I-NEXT: lbu a1, 8(a3) -; RV64I-NEXT: lbu a4, 10(a3) -; RV64I-NEXT: lbu a5, 11(a3) +; RV64I-NEXT: addi a1, sp, 56 +; RV64I-NEXT: add a1, a1, a0 +; RV64I-NEXT: lbu a0, 9(a1) +; RV64I-NEXT: lbu a3, 8(a1) +; RV64I-NEXT: lbu a4, 10(a1) +; RV64I-NEXT: lbu a5, 11(a1) ; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: slli a5, a5, 24 ; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: or a0, a4, a0 -; RV64I-NEXT: lbu a1, 13(a3) -; RV64I-NEXT: lbu a4, 12(a3) -; RV64I-NEXT: lbu a5, 14(a3) -; RV64I-NEXT: lbu a6, 15(a3) -; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, a4 +; RV64I-NEXT: lbu a3, 13(a1) +; RV64I-NEXT: lbu a4, 12(a1) +; RV64I-NEXT: lbu a5, 14(a1) +; RV64I-NEXT: lbu a6, 15(a1) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a1, a4, a1 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a4, a1, a0 -; RV64I-NEXT: andi a1, t1, 7 -; RV64I-NEXT: lbu a0, 17(a3) -; RV64I-NEXT: lbu a5, 16(a3) -; RV64I-NEXT: lbu a6, 18(a3) -; RV64I-NEXT: lbu a7, 19(a3) +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli a3, a3, 32 +; RV64I-NEXT: or a4, a3, a0 +; RV64I-NEXT: andi a3, t2, 7 +; RV64I-NEXT: lbu a0, 17(a1) +; RV64I-NEXT: lbu a5, 16(a1) +; RV64I-NEXT: lbu a6, 18(a1) +; RV64I-NEXT: lbu a7, 19(a1) ; RV64I-NEXT: slli a0, a0, 8 ; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a7, a7, 24 ; RV64I-NEXT: or a5, a7, a6 ; RV64I-NEXT: or a0, a5, a0 -; RV64I-NEXT: lbu a5, 21(a3) -; RV64I-NEXT: lbu a6, 20(a3) -; RV64I-NEXT: lbu a7, 22(a3) -; RV64I-NEXT: lbu t0, 23(a3) +; RV64I-NEXT: lbu a5, 21(a1) +; RV64I-NEXT: lbu a6, 20(a1) +; RV64I-NEXT: lbu a7, 22(a1) +; RV64I-NEXT: lbu t0, 23(a1) ; RV64I-NEXT: slli a5, a5, 8 ; RV64I-NEXT: or a5, a5, a6 ; RV64I-NEXT: slli a7, a7, 16 @@ -2965,22 +2965,22 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli a5, a5, 32 ; RV64I-NEXT: or a5, a5, a0 ; RV64I-NEXT: slli a0, a5, 1 -; RV64I-NEXT: not a6, a1 +; RV64I-NEXT: not a6, a3 ; RV64I-NEXT: sll a0, a0, a6 -; RV64I-NEXT: lbu a6, 1(a3) -; RV64I-NEXT: lbu a7, 0(a3) -; RV64I-NEXT: lbu t0, 2(a3) -; RV64I-NEXT: lbu t1, 3(a3) +; RV64I-NEXT: lbu a6, 1(a1) +; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: lbu t0, 2(a1) +; RV64I-NEXT: lbu t1, 3(a1) ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: or a6, a6, a7 ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t1, t1, 24 ; RV64I-NEXT: or a7, t1, t0 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 5(a3) -; RV64I-NEXT: lbu t0, 4(a3) -; RV64I-NEXT: lbu t1, 6(a3) -; RV64I-NEXT: lbu t2, 7(a3) +; RV64I-NEXT: lbu a7, 5(a1) +; RV64I-NEXT: lbu t0, 4(a1) +; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: lbu t2, 7(a1) ; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 ; RV64I-NEXT: slli t1, t1, 16 @@ -2989,98 +2989,98 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a7, t0, a7 ; RV64I-NEXT: slli a7, a7, 32 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 25(a3) -; RV64I-NEXT: lbu t0, 24(a3) -; RV64I-NEXT: lbu t1, 26(a3) -; RV64I-NEXT: lbu t2, 27(a3) +; RV64I-NEXT: lbu a7, 25(a1) +; RV64I-NEXT: lbu t0, 24(a1) +; RV64I-NEXT: lbu t1, 26(a1) +; RV64I-NEXT: lbu t2, 27(a1) ; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: or t0, t2, t1 ; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: lbu t0, 29(a3) -; RV64I-NEXT: lbu t1, 28(a3) -; RV64I-NEXT: lbu t2, 30(a3) -; RV64I-NEXT: lbu a3, 31(a3) +; RV64I-NEXT: lbu t0, 29(a1) +; RV64I-NEXT: lbu t1, 28(a1) +; RV64I-NEXT: lbu t2, 30(a1) +; RV64I-NEXT: lbu a1, 31(a1) ; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or t0, t0, t1 ; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: slli a3, a3, 24 -; RV64I-NEXT: or a3, a3, t2 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: slli t1, a4, 1 -; RV64I-NEXT: or a3, a3, t0 -; RV64I-NEXT: xori t0, a1, 63 +; RV64I-NEXT: or a1, a1, t0 +; RV64I-NEXT: xori t0, a3, 63 ; RV64I-NEXT: sll t1, t1, t0 -; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: or a3, a3, a7 -; RV64I-NEXT: slli a7, a3, 1 -; RV64I-NEXT: sll a7, a7, t0 -; RV64I-NEXT: srl a4, a4, a1 -; RV64I-NEXT: srl a6, a6, a1 -; RV64I-NEXT: srl a5, a5, a1 -; RV64I-NEXT: sra a1, a3, a1 -; RV64I-NEXT: srli a3, a5, 48 -; RV64I-NEXT: sb a3, 22(a2) -; RV64I-NEXT: srli a3, a5, 40 -; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: srli a3, a5, 32 -; RV64I-NEXT: sb a3, 20(a2) -; RV64I-NEXT: srli a3, a5, 24 -; RV64I-NEXT: sb a3, 19(a2) -; RV64I-NEXT: srli a3, a5, 16 -; RV64I-NEXT: sb a3, 18(a2) -; RV64I-NEXT: or a3, a5, a7 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a7, a1, a7 +; RV64I-NEXT: slli a1, a7, 1 +; RV64I-NEXT: sll t0, a1, t0 +; RV64I-NEXT: srl a1, a4, a3 +; RV64I-NEXT: srl a4, a6, a3 +; RV64I-NEXT: srl a5, a5, a3 +; RV64I-NEXT: sra a3, a7, a3 +; RV64I-NEXT: srli a6, a5, 48 +; RV64I-NEXT: sb a6, 22(a2) +; RV64I-NEXT: srli a6, a5, 40 +; RV64I-NEXT: sb a6, 21(a2) +; RV64I-NEXT: srli a6, a5, 32 +; RV64I-NEXT: sb a6, 20(a2) +; RV64I-NEXT: srli a6, a5, 24 +; RV64I-NEXT: sb a6, 19(a2) +; RV64I-NEXT: srli a6, a5, 16 +; RV64I-NEXT: sb a6, 18(a2) +; RV64I-NEXT: or a6, a5, t0 ; RV64I-NEXT: sb a5, 16(a2) ; RV64I-NEXT: srli a5, a5, 8 ; RV64I-NEXT: sb a5, 17(a2) -; RV64I-NEXT: srli a5, a1, 56 +; RV64I-NEXT: srli a5, a3, 56 ; RV64I-NEXT: sb a5, 31(a2) -; RV64I-NEXT: srli a5, a1, 48 +; RV64I-NEXT: srli a5, a3, 48 ; RV64I-NEXT: sb a5, 30(a2) -; RV64I-NEXT: srli a5, a1, 40 +; RV64I-NEXT: srli a5, a3, 40 ; RV64I-NEXT: sb a5, 29(a2) -; RV64I-NEXT: srli a5, a1, 32 +; RV64I-NEXT: srli a5, a3, 32 ; RV64I-NEXT: sb a5, 28(a2) -; RV64I-NEXT: srli a5, a1, 24 +; RV64I-NEXT: srli a5, a3, 24 ; RV64I-NEXT: sb a5, 27(a2) -; RV64I-NEXT: srli a5, a1, 16 +; RV64I-NEXT: srli a5, a3, 16 ; RV64I-NEXT: sb a5, 26(a2) -; RV64I-NEXT: sb a1, 24(a2) -; RV64I-NEXT: srli a1, a1, 8 -; RV64I-NEXT: sb a1, 25(a2) -; RV64I-NEXT: srli a1, a6, 48 -; RV64I-NEXT: sb a1, 6(a2) -; RV64I-NEXT: srli a1, a6, 40 -; RV64I-NEXT: sb a1, 5(a2) -; RV64I-NEXT: srli a1, a6, 32 -; RV64I-NEXT: sb a1, 4(a2) -; RV64I-NEXT: srli a1, a6, 24 -; RV64I-NEXT: sb a1, 3(a2) -; RV64I-NEXT: srli a1, a6, 16 -; RV64I-NEXT: sb a1, 2(a2) -; RV64I-NEXT: or a1, a6, t1 -; RV64I-NEXT: sb a6, 0(a2) -; RV64I-NEXT: srli a5, a6, 8 -; RV64I-NEXT: sb a5, 1(a2) -; RV64I-NEXT: srli a5, a4, 48 -; RV64I-NEXT: sb a5, 14(a2) -; RV64I-NEXT: srli a5, a4, 40 -; RV64I-NEXT: sb a5, 13(a2) -; RV64I-NEXT: srli a5, a4, 32 -; RV64I-NEXT: sb a5, 12(a2) -; RV64I-NEXT: srli a5, a4, 24 -; RV64I-NEXT: sb a5, 11(a2) -; RV64I-NEXT: srli a5, a4, 16 -; RV64I-NEXT: sb a5, 10(a2) -; RV64I-NEXT: or a0, a4, a0 -; RV64I-NEXT: sb a4, 8(a2) +; RV64I-NEXT: sb a3, 24(a2) +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a3, 25(a2) +; RV64I-NEXT: srli a3, a4, 48 +; RV64I-NEXT: sb a3, 6(a2) +; RV64I-NEXT: srli a3, a4, 40 +; RV64I-NEXT: sb a3, 5(a2) +; RV64I-NEXT: srli a3, a4, 32 +; RV64I-NEXT: sb a3, 4(a2) +; RV64I-NEXT: srli a3, a4, 24 +; RV64I-NEXT: sb a3, 3(a2) +; RV64I-NEXT: srli a3, a4, 16 +; RV64I-NEXT: sb a3, 2(a2) +; RV64I-NEXT: or a3, a4, t1 +; RV64I-NEXT: sb a4, 0(a2) ; RV64I-NEXT: srli a4, a4, 8 -; RV64I-NEXT: sb a4, 9(a2) +; RV64I-NEXT: sb a4, 1(a2) +; RV64I-NEXT: srli a4, a1, 48 +; RV64I-NEXT: sb a4, 14(a2) +; RV64I-NEXT: srli a4, a1, 40 +; RV64I-NEXT: sb a4, 13(a2) +; RV64I-NEXT: srli a4, a1, 32 +; RV64I-NEXT: sb a4, 12(a2) +; RV64I-NEXT: srli a4, a1, 24 +; RV64I-NEXT: sb a4, 11(a2) +; RV64I-NEXT: srli a4, a1, 16 +; RV64I-NEXT: sb a4, 10(a2) +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: sb a1, 8(a2) +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: srli a1, a6, 56 +; RV64I-NEXT: sb a1, 23(a2) ; RV64I-NEXT: srli a3, a3, 56 -; RV64I-NEXT: sb a3, 23(a2) -; RV64I-NEXT: srli a1, a1, 56 -; RV64I-NEXT: sb a1, 7(a2) +; RV64I-NEXT: sb a3, 7(a2) ; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: sb a0, 15(a2) ; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload @@ -3141,20 +3141,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu s6, 16(a0) ; RV32I-NEXT: lbu s7, 17(a0) ; RV32I-NEXT: lbu s8, 18(a0) -; RV32I-NEXT: lbu a4, 1(a1) +; RV32I-NEXT: lbu a3, 1(a1) ; RV32I-NEXT: lbu s9, 19(a0) ; RV32I-NEXT: lbu s10, 20(a0) ; RV32I-NEXT: lbu s11, 0(a1) -; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a3, a3, 8 ; RV32I-NEXT: lbu ra, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: or a4, a4, s11 +; RV32I-NEXT: or a3, a3, s11 ; RV32I-NEXT: lbu s11, 21(a0) ; RV32I-NEXT: slli ra, ra, 16 ; RV32I-NEXT: slli a1, a1, 24 ; RV32I-NEXT: or a1, a1, ra ; RV32I-NEXT: lbu ra, 22(a0) -; RV32I-NEXT: or t1, a1, a4 +; RV32I-NEXT: or t1, a1, a3 ; RV32I-NEXT: lbu t0, 23(a0) ; RV32I-NEXT: lbu a7, 24(a0) ; RV32I-NEXT: lbu a6, 25(a0) @@ -3240,23 +3240,23 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 61(sp) ; RV32I-NEXT: slli a0, t1, 24 ; RV32I-NEXT: srli a0, a0, 27 -; RV32I-NEXT: addi a3, sp, 28 -; RV32I-NEXT: add a3, a3, a0 -; RV32I-NEXT: lbu a0, 5(a3) -; RV32I-NEXT: lbu a1, 4(a3) -; RV32I-NEXT: lbu a4, 6(a3) -; RV32I-NEXT: lbu a5, 7(a3) +; RV32I-NEXT: addi a4, sp, 28 +; RV32I-NEXT: add a4, a4, a0 +; RV32I-NEXT: lbu a0, 5(a4) +; RV32I-NEXT: lbu a1, 4(a4) +; RV32I-NEXT: lbu a3, 6(a4) +; RV32I-NEXT: lbu a5, 7(a4) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a3, a3, 16 ; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: or t4, a4, a0 -; RV32I-NEXT: andi a4, t1, 7 -; RV32I-NEXT: lbu a0, 9(a3) -; RV32I-NEXT: lbu a1, 8(a3) -; RV32I-NEXT: lbu a5, 10(a3) -; RV32I-NEXT: lbu a6, 11(a3) +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or t5, a3, a0 +; RV32I-NEXT: andi a3, t1, 7 +; RV32I-NEXT: lbu a0, 9(a4) +; RV32I-NEXT: lbu a1, 8(a4) +; RV32I-NEXT: lbu a5, 10(a4) +; RV32I-NEXT: lbu a6, 11(a4) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: slli a5, a5, 16 @@ -3264,146 +3264,146 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: or a1, a6, a5 ; RV32I-NEXT: or a6, a1, a0 ; RV32I-NEXT: slli a0, a6, 1 -; RV32I-NEXT: not t0, a4 -; RV32I-NEXT: sll a0, a0, t0 -; RV32I-NEXT: lbu a1, 1(a3) -; RV32I-NEXT: lbu a5, 0(a3) -; RV32I-NEXT: lbu a7, 2(a3) -; RV32I-NEXT: lbu t1, 3(a3) +; RV32I-NEXT: not t1, a3 +; RV32I-NEXT: sll a0, a0, t1 +; RV32I-NEXT: lbu a1, 1(a4) +; RV32I-NEXT: lbu a5, 0(a4) +; RV32I-NEXT: lbu a7, 2(a4) +; RV32I-NEXT: lbu t0, 3(a4) ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a5, t1, a7 -; RV32I-NEXT: or t1, a5, a1 -; RV32I-NEXT: slli a1, t4, 1 -; RV32I-NEXT: xori t2, a4, 31 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or t0, a5, a1 +; RV32I-NEXT: slli a1, t5, 1 +; RV32I-NEXT: xori t2, a3, 31 ; RV32I-NEXT: sll a1, a1, t2 -; RV32I-NEXT: lbu a5, 13(a3) -; RV32I-NEXT: lbu a7, 12(a3) -; RV32I-NEXT: lbu t3, 14(a3) -; RV32I-NEXT: lbu t5, 15(a3) +; RV32I-NEXT: lbu a5, 13(a4) +; RV32I-NEXT: lbu a7, 12(a4) +; RV32I-NEXT: lbu t3, 14(a4) +; RV32I-NEXT: lbu t4, 15(a4) ; RV32I-NEXT: slli a5, a5, 8 ; RV32I-NEXT: or a5, a5, a7 ; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t5, t5, 24 -; RV32I-NEXT: or a7, t5, t3 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: or a7, t4, t3 ; RV32I-NEXT: or t3, a7, a5 -; RV32I-NEXT: lbu a5, 17(a3) -; RV32I-NEXT: lbu a7, 16(a3) -; RV32I-NEXT: lbu t5, 18(a3) -; RV32I-NEXT: lbu t6, 19(a3) +; RV32I-NEXT: lbu a5, 17(a4) +; RV32I-NEXT: lbu a7, 16(a4) +; RV32I-NEXT: lbu t4, 18(a4) +; RV32I-NEXT: lbu t6, 19(a4) ; RV32I-NEXT: slli a5, a5, 8 ; RV32I-NEXT: or a5, a5, a7 -; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t4, t4, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a7, t6, t5 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: slli a7, a5, 1 -; RV32I-NEXT: sll a7, a7, t0 -; RV32I-NEXT: lbu t5, 21(a3) -; RV32I-NEXT: lbu t6, 20(a3) -; RV32I-NEXT: lbu s0, 22(a3) -; RV32I-NEXT: lbu s1, 23(a3) -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: or t5, t5, t6 +; RV32I-NEXT: or a7, t6, t4 +; RV32I-NEXT: or t4, a7, a5 +; RV32I-NEXT: slli a5, t4, 1 +; RV32I-NEXT: sll a7, a5, t1 +; RV32I-NEXT: lbu a5, 21(a4) +; RV32I-NEXT: lbu t6, 20(a4) +; RV32I-NEXT: lbu s0, 22(a4) +; RV32I-NEXT: lbu s1, 23(a4) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, t6 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 ; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or t5, s0, t5 -; RV32I-NEXT: lbu t6, 25(a3) -; RV32I-NEXT: lbu s0, 24(a3) -; RV32I-NEXT: lbu s1, 26(a3) -; RV32I-NEXT: lbu s2, 27(a3) -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: or t6, t6, s0 +; RV32I-NEXT: or s0, s0, a5 +; RV32I-NEXT: lbu a5, 25(a4) +; RV32I-NEXT: lbu t6, 24(a4) +; RV32I-NEXT: lbu s1, 26(a4) +; RV32I-NEXT: lbu s2, 27(a4) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, t6 ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or s0, s2, s1 -; RV32I-NEXT: or t6, s0, t6 -; RV32I-NEXT: lbu s0, 29(a3) -; RV32I-NEXT: lbu s1, 28(a3) +; RV32I-NEXT: or t6, s2, s1 +; RV32I-NEXT: or t6, t6, a5 +; RV32I-NEXT: lbu a5, 29(a4) +; RV32I-NEXT: lbu s1, 28(a4) ; RV32I-NEXT: slli s2, t6, 1 -; RV32I-NEXT: sll t0, s2, t0 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: or s0, s0, s1 -; RV32I-NEXT: lbu s1, 30(a3) -; RV32I-NEXT: lbu a3, 31(a3) +; RV32I-NEXT: sll t1, s2, t1 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, s1 +; RV32I-NEXT: lbu s1, 30(a4) +; RV32I-NEXT: lbu a4, 31(a4) ; RV32I-NEXT: slli s2, t3, 1 ; RV32I-NEXT: sll s2, s2, t2 ; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a3, a3, s1 -; RV32I-NEXT: slli s1, t5, 1 +; RV32I-NEXT: slli a4, a4, 24 +; RV32I-NEXT: or a4, a4, s1 +; RV32I-NEXT: slli s1, s0, 1 ; RV32I-NEXT: sll s1, s1, t2 -; RV32I-NEXT: or a3, a3, s0 -; RV32I-NEXT: slli s0, a3, 1 -; RV32I-NEXT: sll t2, s0, t2 -; RV32I-NEXT: srl t4, t4, a4 -; RV32I-NEXT: srl t1, t1, a4 -; RV32I-NEXT: srl t3, t3, a4 -; RV32I-NEXT: srl a6, a6, a4 -; RV32I-NEXT: srl t5, t5, a4 -; RV32I-NEXT: srl a5, a5, a4 -; RV32I-NEXT: srl t6, t6, a4 -; RV32I-NEXT: sra a3, a3, a4 -; RV32I-NEXT: srli a4, t6, 16 -; RV32I-NEXT: sb a4, 26(a2) -; RV32I-NEXT: or a4, t6, t2 -; RV32I-NEXT: sb t6, 24(a2) -; RV32I-NEXT: srli t2, t6, 8 -; RV32I-NEXT: sb t2, 25(a2) -; RV32I-NEXT: srli t2, a3, 24 -; RV32I-NEXT: sb t2, 31(a2) -; RV32I-NEXT: srli t2, a3, 16 -; RV32I-NEXT: sb t2, 30(a2) +; RV32I-NEXT: or s3, a4, a5 +; RV32I-NEXT: slli a4, s3, 1 +; RV32I-NEXT: sll t2, a4, t2 +; RV32I-NEXT: srl a4, t5, a3 +; RV32I-NEXT: srl a5, t0, a3 +; RV32I-NEXT: srl t0, t3, a3 +; RV32I-NEXT: srl a6, a6, a3 +; RV32I-NEXT: srl t3, s0, a3 +; RV32I-NEXT: srl t4, t4, a3 +; RV32I-NEXT: srl t5, t6, a3 +; RV32I-NEXT: sra a3, s3, a3 +; RV32I-NEXT: srli t6, t5, 16 +; RV32I-NEXT: sb t6, 26(a2) +; RV32I-NEXT: or t2, t5, t2 +; RV32I-NEXT: sb t5, 24(a2) +; RV32I-NEXT: srli t5, t5, 8 +; RV32I-NEXT: sb t5, 25(a2) +; RV32I-NEXT: srli t5, a3, 24 +; RV32I-NEXT: sb t5, 31(a2) +; RV32I-NEXT: srli t5, a3, 16 +; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb a3, 28(a2) ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 29(a2) -; RV32I-NEXT: srli a3, a5, 16 +; RV32I-NEXT: srli a3, t4, 16 ; RV32I-NEXT: sb a3, 18(a2) -; RV32I-NEXT: or s1, a5, s1 -; RV32I-NEXT: sb a5, 16(a2) -; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 17(a2) -; RV32I-NEXT: srli a3, t5, 16 -; RV32I-NEXT: sb a3, 22(a2) -; RV32I-NEXT: or a3, t5, t0 -; RV32I-NEXT: sb t5, 20(a2) -; RV32I-NEXT: srli a5, t5, 8 -; RV32I-NEXT: sb a5, 21(a2) -; RV32I-NEXT: srli a5, a6, 16 -; RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: or a5, a6, s2 +; RV32I-NEXT: or a3, t4, s1 +; RV32I-NEXT: sb t4, 16(a2) +; RV32I-NEXT: srli t4, t4, 8 +; RV32I-NEXT: sb t4, 17(a2) +; RV32I-NEXT: srli t4, t3, 16 +; RV32I-NEXT: sb t4, 22(a2) +; RV32I-NEXT: or t1, t3, t1 +; RV32I-NEXT: sb t3, 20(a2) +; RV32I-NEXT: srli t3, t3, 8 +; RV32I-NEXT: sb t3, 21(a2) +; RV32I-NEXT: srli t3, a6, 16 +; RV32I-NEXT: sb t3, 10(a2) +; RV32I-NEXT: or t3, a6, s2 ; RV32I-NEXT: sb a6, 8(a2) ; RV32I-NEXT: srli a6, a6, 8 ; RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: srli a6, t3, 16 +; RV32I-NEXT: srli a6, t0, 16 ; RV32I-NEXT: sb a6, 14(a2) -; RV32I-NEXT: or a6, t3, a7 -; RV32I-NEXT: sb t3, 12(a2) -; RV32I-NEXT: srli a7, t3, 8 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: sb t0, 12(a2) +; RV32I-NEXT: srli a7, t0, 8 ; RV32I-NEXT: sb a7, 13(a2) -; RV32I-NEXT: srli a7, t1, 16 +; RV32I-NEXT: srli a7, a5, 16 ; RV32I-NEXT: sb a7, 2(a2) -; RV32I-NEXT: or a1, t1, a1 -; RV32I-NEXT: sb t1, 0(a2) -; RV32I-NEXT: srli a7, t1, 8 -; RV32I-NEXT: sb a7, 1(a2) -; RV32I-NEXT: srli a7, t4, 16 -; RV32I-NEXT: sb a7, 6(a2) -; RV32I-NEXT: or a0, t4, a0 -; RV32I-NEXT: sb t4, 4(a2) -; RV32I-NEXT: srli a7, t4, 8 -; RV32I-NEXT: sb a7, 5(a2) -; RV32I-NEXT: srli a4, a4, 24 +; RV32I-NEXT: or a1, a5, a1 +; RV32I-NEXT: sb a5, 0(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 1(a2) +; RV32I-NEXT: srli a5, a4, 16 +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: sb a4, 4(a2) +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 5(a2) +; RV32I-NEXT: srli a4, t2, 24 ; RV32I-NEXT: sb a4, 27(a2) -; RV32I-NEXT: srli s1, s1, 24 -; RV32I-NEXT: sb s1, 19(a2) ; RV32I-NEXT: srli a3, a3, 24 +; RV32I-NEXT: sb a3, 19(a2) +; RV32I-NEXT: srli a3, t1, 24 ; RV32I-NEXT: sb a3, 23(a2) -; RV32I-NEXT: srli a5, a5, 24 -; RV32I-NEXT: sb a5, 11(a2) +; RV32I-NEXT: srli a3, t3, 24 +; RV32I-NEXT: sb a3, 11(a2) ; RV32I-NEXT: srli a3, a6, 24 ; RV32I-NEXT: sb a3, 15(a2) ; RV32I-NEXT: srli a1, a1, 24 diff --git a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll index e274671f88757..ae1de443bce05 100644 --- a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll @@ -11,27 +11,27 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: .cfi_window_save ; SPARC-NEXT: .cfi_register %o7, %i7 ; SPARC-NEXT: ld [%fp+96], %l1 -; SPARC-NEXT: mov %i3, %l0 +; SPARC-NEXT: mov %i3, %g4 ; SPARC-NEXT: mov %i2, %g2 ; SPARC-NEXT: umul %i3, %l1, %i3 ; SPARC-NEXT: rd %y, %i2 ; SPARC-NEXT: ld [%fp+92], %l2 ; SPARC-NEXT: umul %g2, %l1, %g3 -; SPARC-NEXT: rd %y, %g4 +; SPARC-NEXT: rd %y, %l0 ; SPARC-NEXT: addcc %g3, %i2, %i2 -; SPARC-NEXT: addxcc %g4, 0, %g3 -; SPARC-NEXT: umul %l0, %l2, %g4 +; SPARC-NEXT: addxcc %l0, 0, %g3 +; SPARC-NEXT: umul %g4, %l2, %l0 ; SPARC-NEXT: rd %y, %l3 -; SPARC-NEXT: addcc %g4, %i2, %i2 -; SPARC-NEXT: addxcc %l3, 0, %g4 -; SPARC-NEXT: addcc %g3, %g4, %g3 -; SPARC-NEXT: addxcc %g0, 0, %g4 +; SPARC-NEXT: addcc %l0, %i2, %i2 +; SPARC-NEXT: addxcc %l3, 0, %l0 +; SPARC-NEXT: addcc %g3, %l0, %g3 +; SPARC-NEXT: addxcc %g0, 0, %l0 ; SPARC-NEXT: umul %g2, %l2, %l3 ; SPARC-NEXT: rd %y, %l4 ; SPARC-NEXT: addcc %l3, %g3, %g3 ; SPARC-NEXT: umul %i1, %l1, %l3 ; SPARC-NEXT: rd %y, %l5 -; SPARC-NEXT: addxcc %l4, %g4, %g4 +; SPARC-NEXT: addxcc %l4, %l0, %l0 ; SPARC-NEXT: umul %i0, %l1, %l4 ; SPARC-NEXT: rd %y, %l6 ; SPARC-NEXT: addcc %l4, %l5, %l4 @@ -47,16 +47,16 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: addcc %l7, %l5, %l5 ; SPARC-NEXT: addxcc %o0, %l6, %l6 ; SPARC-NEXT: addcc %l3, %g3, %g3 -; SPARC-NEXT: addxcc %l4, %g4, %g4 +; SPARC-NEXT: addxcc %l4, %l0, %l0 ; SPARC-NEXT: addxcc %l5, 0, %l3 -; SPARC-NEXT: umul %l0, %i5, %l4 +; SPARC-NEXT: umul %g4, %i5, %l4 ; SPARC-NEXT: rd %y, %l5 ; SPARC-NEXT: addxcc %l6, 0, %l6 ; SPARC-NEXT: umul %g2, %i5, %l7 ; SPARC-NEXT: rd %y, %o0 ; SPARC-NEXT: addcc %l7, %l5, %l5 ; SPARC-NEXT: addxcc %o0, 0, %l7 -; SPARC-NEXT: umul %l0, %i4, %o0 +; SPARC-NEXT: umul %g4, %i4, %o0 ; SPARC-NEXT: rd %y, %o1 ; SPARC-NEXT: addcc %o0, %l5, %l5 ; SPARC-NEXT: addxcc %o1, 0, %o0 @@ -67,7 +67,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: addcc %o1, %l7, %l7 ; SPARC-NEXT: addxcc %o2, %o0, %o0 ; SPARC-NEXT: addcc %l4, %g3, %g3 -; SPARC-NEXT: addxcc %l5, %g4, %g4 +; SPARC-NEXT: addxcc %l5, %l0, %l0 ; SPARC-NEXT: addxcc %l7, 0, %l4 ; SPARC-NEXT: addxcc %o0, 0, %l5 ; SPARC-NEXT: addcc %l3, %l4, %l3 @@ -118,21 +118,21 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: addxcc %o0, %o3, %l6 ; SPARC-NEXT: addcc %l2, %o1, %l2 ; SPARC-NEXT: sra %i4, 31, %i4 -; SPARC-NEXT: umul %l0, %i4, %l0 +; SPARC-NEXT: umul %g4, %i4, %g4 ; SPARC-NEXT: rd %y, %o0 ; SPARC-NEXT: addxcc %l6, %l7, %l6 ; SPARC-NEXT: umul %i4, %g2, %g2 ; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: add %o0, %l0, %o1 +; SPARC-NEXT: add %o0, %g4, %o1 ; SPARC-NEXT: smul %i0, %i4, %i0 ; SPARC-NEXT: umul %i1, %i4, %i1 ; SPARC-NEXT: rd %y, %i4 ; SPARC-NEXT: add %o1, %g2, %o1 ; SPARC-NEXT: add %i4, %i1, %i4 ; SPARC-NEXT: add %i4, %i0, %i0 -; SPARC-NEXT: addcc %i1, %l0, %i1 +; SPARC-NEXT: addcc %i1, %g4, %i1 ; SPARC-NEXT: addxcc %i0, %o1, %i0 -; SPARC-NEXT: addcc %l0, %o0, %i4 +; SPARC-NEXT: addcc %g4, %o0, %i4 ; SPARC-NEXT: addxcc %o0, 0, %o0 ; SPARC-NEXT: addcc %g2, %i4, %i4 ; SPARC-NEXT: addxcc %l7, 0, %o1 @@ -142,7 +142,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: addxcc %l7, %o1, %l7 ; SPARC-NEXT: addcc %g2, %i1, %i1 ; SPARC-NEXT: addxcc %l7, %i0, %i0 -; SPARC-NEXT: addcc %l0, %l1, %g2 +; SPARC-NEXT: addcc %g4, %l1, %g2 ; SPARC-NEXT: addxcc %i4, %o2, %i4 ; SPARC-NEXT: addxcc %i1, %l2, %i1 ; SPARC-NEXT: addxcc %i0, %l6, %i0 @@ -150,7 +150,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: addxcc %l4, %i4, %i4 ; SPARC-NEXT: addxcc %l5, %i1, %i1 ; SPARC-NEXT: addxcc %i5, %i0, %i0 -; SPARC-NEXT: sra %g4, 31, %i5 +; SPARC-NEXT: sra %l0, 31, %i5 ; SPARC-NEXT: xor %i0, %i5, %i0 ; SPARC-NEXT: xor %i4, %i5, %i4 ; SPARC-NEXT: or %i4, %i0, %i0 @@ -167,7 +167,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: .LBB0_2: ; SPARC-NEXT: mov 1, %i4 ; SPARC-NEXT: .LBB0_3: ! %start -; SPARC-NEXT: mov %g4, %i0 +; SPARC-NEXT: mov %l0, %i0 ; SPARC-NEXT: ret ; SPARC-NEXT: restore %g0, %g3, %o1 ; @@ -226,13 +226,13 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC64-NEXT: mov %g0, %o2 ; SPARC64-NEXT: call __multi3 ; SPARC64-NEXT: mov %i2, %o3 -; SPARC64-NEXT: srlx %o1, 32, %i3 -; SPARC64-NEXT: srlx %o0, 32, %g2 -; SPARC64-NEXT: addcc %o1, %i5, %i5 -; SPARC64-NEXT: addxcc %i3, %i4, %i3 -; SPARC64-NEXT: addxcc %o0, 0, %i4 -; SPARC64-NEXT: addxcc %g2, 0, %g2 -; SPARC64-NEXT: addcc %l4, %i4, %i4 +; SPARC64-NEXT: srlx %o1, 32, %g2 +; SPARC64-NEXT: srlx %o0, 32, %g3 +; SPARC64-NEXT: addcc %o1, %i5, %i3 +; SPARC64-NEXT: addxcc %g2, %i4, %i4 +; SPARC64-NEXT: addxcc %o0, 0, %i5 +; SPARC64-NEXT: addxcc %g3, 0, %g2 +; SPARC64-NEXT: addcc %l4, %i5, %i5 ; SPARC64-NEXT: addxcc %l5, %g2, %l4 ; SPARC64-NEXT: addxcc %g0, 0, %l5 ; SPARC64-NEXT: addxcc %g0, 0, %l6 @@ -243,29 +243,29 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC64-NEXT: mov %i2, %o3 ; SPARC64-NEXT: mov %g0, %i2 ; SPARC64-NEXT: srlx %o1, 32, %i0 -; SPARC64-NEXT: addcc %o1, %i4, %i4 +; SPARC64-NEXT: addcc %o1, %i5, %i5 ; SPARC64-NEXT: srlx %o0, 32, %g2 ; SPARC64-NEXT: addxcc %i0, %l4, %i0 ; SPARC64-NEXT: addxcc %o0, %l5, %g3 ; SPARC64-NEXT: addxcc %g2, %l6, %g2 -; SPARC64-NEXT: addcc %i4, %l0, %i4 +; SPARC64-NEXT: addcc %i5, %l0, %i5 ; SPARC64-NEXT: addxcc %i0, %l1, %i0 ; SPARC64-NEXT: addxcc %g3, %l2, %g3 ; SPARC64-NEXT: addxcc %g2, %l3, %g2 ; SPARC64-NEXT: srl %g3, 0, %g3 ; SPARC64-NEXT: sllx %g2, 32, %g2 ; SPARC64-NEXT: or %g2, %g3, %g2 -; SPARC64-NEXT: sllx %i3, 32, %i3 -; SPARC64-NEXT: srax %i3, 63, %g3 +; SPARC64-NEXT: sllx %i4, 32, %i4 +; SPARC64-NEXT: srax %i4, 63, %g3 ; SPARC64-NEXT: xor %g2, %g3, %g2 -; SPARC64-NEXT: srl %i4, 0, %i4 +; SPARC64-NEXT: srl %i5, 0, %i5 ; SPARC64-NEXT: sllx %i0, 32, %i0 -; SPARC64-NEXT: or %i0, %i4, %i0 +; SPARC64-NEXT: or %i0, %i5, %i0 ; SPARC64-NEXT: xor %i0, %g3, %i0 ; SPARC64-NEXT: or %i0, %g2, %i0 ; SPARC64-NEXT: movrnz %i0, 1, %i2 -; SPARC64-NEXT: srl %i5, 0, %i0 -; SPARC64-NEXT: or %i3, %i0, %i0 +; SPARC64-NEXT: srl %i3, 0, %i0 +; SPARC64-NEXT: or %i4, %i0, %i0 ; SPARC64-NEXT: srl %i2, 0, %i2 ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore diff --git a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll index 257e9723f2cf3..9ca895fe78073 100644 --- a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll @@ -13,147 +13,147 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: mov %i3, %g2 ; SPARC-NEXT: mov %i2, %g4 ; SPARC-NEXT: umul %i2, %i5, %i2 -; SPARC-NEXT: rd %y, %o0 -; SPARC-NEXT: ld [%fp+92], %l6 +; SPARC-NEXT: rd %y, %l7 +; SPARC-NEXT: ld [%fp+92], %l4 ; SPARC-NEXT: umul %i4, %i3, %i3 -; SPARC-NEXT: rd %y, %o2 +; SPARC-NEXT: rd %y, %o1 ; SPARC-NEXT: ld [%fp+96], %g3 -; SPARC-NEXT: umul %i5, %g2, %l0 -; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: umul %l6, %i1, %l2 -; SPARC-NEXT: rd %y, %l3 +; SPARC-NEXT: umul %i5, %g2, %l3 +; SPARC-NEXT: rd %y, %o0 +; SPARC-NEXT: umul %l4, %i1, %l2 +; SPARC-NEXT: rd %y, %l1 ; SPARC-NEXT: add %i3, %i2, %i2 ; SPARC-NEXT: umul %i0, %g3, %i3 -; SPARC-NEXT: rd %y, %l5 -; SPARC-NEXT: add %l7, %i2, %o1 +; SPARC-NEXT: rd %y, %l6 +; SPARC-NEXT: add %o0, %i2, %o2 ; SPARC-NEXT: umul %i1, %g3, %i2 -; SPARC-NEXT: rd %y, %l1 +; SPARC-NEXT: rd %y, %l0 ; SPARC-NEXT: add %i3, %l2, %i3 -; SPARC-NEXT: add %l1, %i3, %l2 -; SPARC-NEXT: addcc %i2, %l0, %l0 +; SPARC-NEXT: add %l0, %i3, %l2 +; SPARC-NEXT: addcc %i2, %l3, %l3 ; SPARC-NEXT: umul %g2, %g3, %i3 ; SPARC-NEXT: rd %y, %i2 -; SPARC-NEXT: addxcc %l2, %o1, %o4 +; SPARC-NEXT: addxcc %l2, %o2, %o4 ; SPARC-NEXT: umul %g4, %g3, %g3 -; SPARC-NEXT: rd %y, %l4 +; SPARC-NEXT: rd %y, %l5 ; SPARC-NEXT: addcc %g3, %i2, %i2 -; SPARC-NEXT: addxcc %l4, 0, %g3 -; SPARC-NEXT: umul %g2, %l6, %g2 -; SPARC-NEXT: rd %y, %l4 +; SPARC-NEXT: addxcc %l5, 0, %g3 +; SPARC-NEXT: umul %g2, %l4, %g2 +; SPARC-NEXT: rd %y, %l5 ; SPARC-NEXT: addcc %g2, %i2, %i2 -; SPARC-NEXT: addxcc %l4, 0, %g2 +; SPARC-NEXT: addxcc %l5, 0, %g2 ; SPARC-NEXT: addcc %g3, %g2, %g2 ; SPARC-NEXT: addxcc %g0, 0, %g3 -; SPARC-NEXT: umul %g4, %l6, %l4 +; SPARC-NEXT: umul %g4, %l4, %l5 ; SPARC-NEXT: rd %y, %o3 -; SPARC-NEXT: addcc %l4, %g2, %l4 +; SPARC-NEXT: addcc %l5, %g2, %l5 ; SPARC-NEXT: addxcc %o3, %g3, %o3 -; SPARC-NEXT: addcc %l4, %l0, %g2 +; SPARC-NEXT: addcc %l5, %l3, %g2 ; SPARC-NEXT: addxcc %o3, %o4, %g3 -; SPARC-NEXT: mov 1, %l0 +; SPARC-NEXT: mov 1, %l3 ; SPARC-NEXT: cmp %g3, %o3 ; SPARC-NEXT: bcs .LBB0_2 -; SPARC-NEXT: mov %l0, %o4 +; SPARC-NEXT: mov %l3, %o4 ; SPARC-NEXT: ! %bb.1: ! %start ; SPARC-NEXT: mov %g0, %o4 ; SPARC-NEXT: .LBB0_2: ! %start -; SPARC-NEXT: cmp %g2, %l4 +; SPARC-NEXT: cmp %g2, %l5 ; SPARC-NEXT: bcs .LBB0_4 -; SPARC-NEXT: mov %l0, %l4 +; SPARC-NEXT: mov %l3, %l5 ; SPARC-NEXT: ! %bb.3: ! %start -; SPARC-NEXT: mov %g0, %l4 +; SPARC-NEXT: mov %g0, %l5 ; SPARC-NEXT: .LBB0_4: ! %start ; SPARC-NEXT: cmp %g3, %o3 ; SPARC-NEXT: be .LBB0_6 ; SPARC-NEXT: nop ; SPARC-NEXT: ! %bb.5: ! %start -; SPARC-NEXT: mov %o4, %l4 +; SPARC-NEXT: mov %o4, %l5 ; SPARC-NEXT: .LBB0_6: ! %start ; SPARC-NEXT: cmp %g4, 0 ; SPARC-NEXT: bne .LBB0_8 -; SPARC-NEXT: mov %l0, %g4 +; SPARC-NEXT: mov %l3, %o3 ; SPARC-NEXT: ! %bb.7: ! %start -; SPARC-NEXT: mov %g0, %g4 +; SPARC-NEXT: mov %g0, %o3 ; SPARC-NEXT: .LBB0_8: ! %start ; SPARC-NEXT: cmp %i4, 0 ; SPARC-NEXT: bne .LBB0_10 -; SPARC-NEXT: mov %l0, %o3 +; SPARC-NEXT: mov %l3, %o4 ; SPARC-NEXT: ! %bb.9: ! %start -; SPARC-NEXT: mov %g0, %o3 +; SPARC-NEXT: mov %g0, %o4 ; SPARC-NEXT: .LBB0_10: ! %start -; SPARC-NEXT: cmp %o2, 0 +; SPARC-NEXT: cmp %o1, 0 ; SPARC-NEXT: bne .LBB0_12 -; SPARC-NEXT: mov %l0, %o2 +; SPARC-NEXT: mov %l3, %o1 ; SPARC-NEXT: ! %bb.11: ! %start -; SPARC-NEXT: mov %g0, %o2 +; SPARC-NEXT: mov %g0, %o1 ; SPARC-NEXT: .LBB0_12: ! %start -; SPARC-NEXT: cmp %o0, 0 +; SPARC-NEXT: cmp %l7, 0 ; SPARC-NEXT: bne .LBB0_14 -; SPARC-NEXT: mov %l0, %o0 +; SPARC-NEXT: mov %l3, %l7 ; SPARC-NEXT: ! %bb.13: ! %start -; SPARC-NEXT: mov %g0, %o0 +; SPARC-NEXT: mov %g0, %l7 ; SPARC-NEXT: .LBB0_14: ! %start -; SPARC-NEXT: cmp %o1, %l7 +; SPARC-NEXT: cmp %o2, %o0 ; SPARC-NEXT: bcs .LBB0_16 -; SPARC-NEXT: mov %l0, %l7 +; SPARC-NEXT: mov %l3, %g4 ; SPARC-NEXT: ! %bb.15: ! %start -; SPARC-NEXT: mov %g0, %l7 +; SPARC-NEXT: mov %g0, %g4 ; SPARC-NEXT: .LBB0_16: ! %start -; SPARC-NEXT: cmp %l6, 0 +; SPARC-NEXT: cmp %l4, 0 ; SPARC-NEXT: bne .LBB0_18 -; SPARC-NEXT: mov %l0, %l6 +; SPARC-NEXT: mov %l3, %l4 ; SPARC-NEXT: ! %bb.17: ! %start -; SPARC-NEXT: mov %g0, %l6 +; SPARC-NEXT: mov %g0, %l4 ; SPARC-NEXT: .LBB0_18: ! %start ; SPARC-NEXT: cmp %i0, 0 ; SPARC-NEXT: bne .LBB0_20 -; SPARC-NEXT: mov %l0, %o1 +; SPARC-NEXT: mov %l3, %o0 ; SPARC-NEXT: ! %bb.19: ! %start -; SPARC-NEXT: mov %g0, %o1 +; SPARC-NEXT: mov %g0, %o0 ; SPARC-NEXT: .LBB0_20: ! %start -; SPARC-NEXT: cmp %l5, 0 +; SPARC-NEXT: cmp %l6, 0 ; SPARC-NEXT: bne .LBB0_22 -; SPARC-NEXT: mov %l0, %l5 +; SPARC-NEXT: mov %l3, %l6 ; SPARC-NEXT: ! %bb.21: ! %start -; SPARC-NEXT: mov %g0, %l5 +; SPARC-NEXT: mov %g0, %l6 ; SPARC-NEXT: .LBB0_22: ! %start -; SPARC-NEXT: and %o3, %g4, %g4 -; SPARC-NEXT: cmp %l3, 0 -; SPARC-NEXT: and %o1, %l6, %o1 +; SPARC-NEXT: and %o4, %o3, %o2 +; SPARC-NEXT: cmp %l1, 0 +; SPARC-NEXT: and %o0, %l4, %l4 ; SPARC-NEXT: bne .LBB0_24 -; SPARC-NEXT: mov %l0, %l3 +; SPARC-NEXT: mov %l3, %l1 ; SPARC-NEXT: ! %bb.23: ! %start -; SPARC-NEXT: mov %g0, %l3 +; SPARC-NEXT: mov %g0, %l1 ; SPARC-NEXT: .LBB0_24: ! %start -; SPARC-NEXT: or %g4, %o2, %l6 -; SPARC-NEXT: cmp %l2, %l1 -; SPARC-NEXT: or %o1, %l5, %l2 +; SPARC-NEXT: or %o2, %o1, %o0 +; SPARC-NEXT: cmp %l2, %l0 +; SPARC-NEXT: or %l4, %l6, %l4 ; SPARC-NEXT: bcs .LBB0_26 -; SPARC-NEXT: mov %l0, %g4 +; SPARC-NEXT: mov %l3, %l0 ; SPARC-NEXT: ! %bb.25: ! %start -; SPARC-NEXT: mov %g0, %g4 +; SPARC-NEXT: mov %g0, %l0 ; SPARC-NEXT: .LBB0_26: ! %start -; SPARC-NEXT: or %l6, %o0, %l1 +; SPARC-NEXT: or %o0, %l7, %l2 ; SPARC-NEXT: or %i5, %i4, %i4 ; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: or %l2, %l3, %l2 +; SPARC-NEXT: or %l4, %l1, %l1 ; SPARC-NEXT: bne .LBB0_28 -; SPARC-NEXT: mov %l0, %i4 +; SPARC-NEXT: mov %l3, %i4 ; SPARC-NEXT: ! %bb.27: ! %start ; SPARC-NEXT: mov %g0, %i4 ; SPARC-NEXT: .LBB0_28: ! %start -; SPARC-NEXT: or %l1, %l7, %i5 +; SPARC-NEXT: or %l2, %g4, %i5 ; SPARC-NEXT: or %i1, %i0, %i0 ; SPARC-NEXT: cmp %i0, 0 ; SPARC-NEXT: bne .LBB0_30 -; SPARC-NEXT: or %l2, %g4, %i0 +; SPARC-NEXT: or %l1, %l0, %i0 ; SPARC-NEXT: ! %bb.29: ! %start -; SPARC-NEXT: mov %g0, %l0 +; SPARC-NEXT: mov %g0, %l3 ; SPARC-NEXT: .LBB0_30: ! %start -; SPARC-NEXT: and %l0, %i4, %i1 +; SPARC-NEXT: and %l3, %i4, %i1 ; SPARC-NEXT: or %i1, %i0, %i0 ; SPARC-NEXT: or %i0, %i5, %i0 -; SPARC-NEXT: or %i0, %l4, %i0 +; SPARC-NEXT: or %i0, %l5, %i0 ; SPARC-NEXT: and %i0, 1, %i4 ; SPARC-NEXT: mov %g3, %i0 ; SPARC-NEXT: ret diff --git a/llvm/test/CodeGen/SystemZ/int-conv-01.ll b/llvm/test/CodeGen/SystemZ/int-conv-01.ll index 91ef0802223ed..491fb95cddf72 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-01.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-01.ll @@ -108,7 +108,7 @@ define i32 @f9(i64 %src, i64 %index) { ; to use LB if possible. define void @f10(ptr %ptr) { ; CHECK-LABEL: f10: -; CHECK: lb {{%r[0-9]+}}, 191(%r15) +; CHECK: lb {{%r[0-9]+}}, 183(%r15) ; CHECK: br %r14 %val0 = load volatile i32, ptr %ptr %val1 = load volatile i32, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-conv-02.ll b/llvm/test/CodeGen/SystemZ/int-conv-02.ll index 9134e5bdbe17b..6c33ee1098ff7 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-02.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-02.ll @@ -118,7 +118,7 @@ define i32 @f10(i64 %src, i64 %index) { ; to use LLC if possible. define void @f11(ptr %ptr) { ; CHECK-LABEL: f11: -; CHECK: llc {{%r[0-9]+}}, 187(%r15) +; CHECK: llc {{%r[0-9]+}}, 179(%r15) ; CHECK: br %r14 %val0 = load volatile i32, ptr %ptr %val1 = load volatile i32, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-conv-03.ll b/llvm/test/CodeGen/SystemZ/int-conv-03.ll index 9c364651edf70..41f2f87186a5e 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-03.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-03.ll @@ -108,7 +108,7 @@ define i64 @f9(i64 %src, i64 %index) { ; to use LGB if possible. define void @f10(ptr %ptr) { ; CHECK-LABEL: f10: -; CHECK: lgb {{%r[0-9]+}}, 167(%r15) +; CHECK: lgb {{%r[0-9]+}}, 199(%r15) ; CHECK: br %r14 %val0 = load volatile i64, ptr %ptr %val1 = load volatile i64, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-conv-04.ll b/llvm/test/CodeGen/SystemZ/int-conv-04.ll index c589d0d0d4cfe..5c808920ff25e 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-04.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-04.ll @@ -117,7 +117,7 @@ define i64 @f10(i64 %src, i64 %index) { ; to use LLGC if possible. define void @f11(ptr %ptr) { ; CHECK-LABEL: f11: -; CHECK: llgc {{%r[0-9]+}}, 167(%r15) +; CHECK: llgc {{%r[0-9]+}}, 199(%r15) ; CHECK: br %r14 %val0 = load volatile i64, ptr %ptr %val1 = load volatile i64, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-conv-06.ll b/llvm/test/CodeGen/SystemZ/int-conv-06.ll index ff91c91f8f14c..1163e1e04ce6c 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-06.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-06.ll @@ -118,7 +118,7 @@ define i32 @f10(i64 %src, i64 %index) { ; to use LLH if possible. define void @f11(ptr %ptr) { ; CHECK-LABEL: f11: -; CHECK: llh {{%r[0-9]+}}, 186(%r15) +; CHECK: llh {{%r[0-9]+}}, 178(%r15) ; CHECK: br %r14 %val0 = load volatile i32, ptr %ptr %val1 = load volatile i32, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-conv-07.ll b/llvm/test/CodeGen/SystemZ/int-conv-07.ll index a36154fd78a8f..bc2895da2cde0 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-07.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-07.ll @@ -108,7 +108,7 @@ define i64 @f9(i64 %src, i64 %index) { ; to use LGH if possible. define void @f10(ptr %ptr) { ; CHECK-LABEL: f10: -; CHECK: lgh {{%r[0-9]+}}, 166(%r15) +; CHECK: lgh {{%r[0-9]+}}, 198(%r15) ; CHECK: br %r14 %val0 = load volatile i64, ptr %ptr %val1 = load volatile i64, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-conv-08.ll b/llvm/test/CodeGen/SystemZ/int-conv-08.ll index 0abe41f299e75..82f2bcea4af78 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-08.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-08.ll @@ -117,7 +117,7 @@ define i64 @f10(i64 %src, i64 %index) { ; to use LLGH if possible. define void @f11(ptr %ptr) { ; CHECK-LABEL: f11: -; CHECK: llgh {{%r[0-9]+}}, 166(%r15) +; CHECK: llgh {{%r[0-9]+}}, 198(%r15) ; CHECK: br %r14 %val0 = load volatile i64, ptr %ptr %val1 = load volatile i64, ptr %ptr diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll index 14ceee58ef55c..40207090fda66 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll @@ -241,11 +241,11 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrh.u16 q0, [r5] ; CHECK-NEXT: vshl.i16 q1, q0, #3 -; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload ; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmla.i16 q4, q1, r3 -; CHECK-NEXT: vmov.f64 d6, d4 -; CHECK-NEXT: vmov.f64 d7, d5 +; CHECK-NEXT: vmla.i16 q3, q1, r3 +; CHECK-NEXT: vmov.f64 d8, d4 +; CHECK-NEXT: vmov.f64 d9, d5 ; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vshr.u16 q2, q0, #9 ; CHECK-NEXT: vshr.u16 q0, q0, #3 @@ -253,17 +253,17 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc ; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vmla.i16 q1, q0, r3 ; CHECK-NEXT: vand q2, q2, q5 -; CHECK-NEXT: vshr.u16 q0, q4, #11 -; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vshr.u16 q0, q3, #11 +; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vshr.u16 q1, q1, #5 -; CHECK-NEXT: vmla.i16 q4, q2, r3 +; CHECK-NEXT: vmla.i16 q3, q2, r3 ; CHECK-NEXT: vand q1, q1, q7 ; CHECK-NEXT: vorr q0, q1, q0 -; CHECK-NEXT: vand q1, q4, q6 +; CHECK-NEXT: vand q1, q3, q6 ; CHECK-NEXT: vorr q0, q0, q1 ; CHECK-NEXT: vstrh.16 q0, [r5], #16 -; CHECK-NEXT: vmov.f64 d4, d6 -; CHECK-NEXT: vmov.f64 d5, d7 +; CHECK-NEXT: vmov.f64 d4, d8 +; CHECK-NEXT: vmov.f64 d5, d9 ; CHECK-NEXT: letp lr, .LBB1_4 ; CHECK-NEXT: @ %bb.5: @ %for.cond3.for.cond.cleanup7_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB1_3 Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-fmath.ll b/llvm/test/CodeGen/Thumb2/mve-fmath.ll index 3a477f987cee6..c299b62a4c942 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fmath.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fmath.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LV -; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LV -; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LIS -; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LIS +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s define arm_aapcs_vfpcc <4 x float> @sqrt_float32_t(<4 x float> %src) { ; CHECK-LABEL: sqrt_float32_t: @@ -1087,37 +1087,21 @@ entry: } define arm_aapcs_vfpcc <2 x double> @copysign_float64_t(<2 x double> %src1, <2 x double> %src2) { -; CHECK-LV-LABEL: copysign_float64_t: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .save {r7, lr} -; CHECK-LV-NEXT: push {r7, lr} -; CHECK-LV-NEXT: vmov r0, r1, d3 -; CHECK-LV-NEXT: vmov r0, lr, d2 -; CHECK-LV-NEXT: vmov r0, r3, d1 -; CHECK-LV-NEXT: vmov r12, r2, d0 -; CHECK-LV-NEXT: lsrs r1, r1, #31 -; CHECK-LV-NEXT: bfi r3, r1, #31, #1 -; CHECK-LV-NEXT: lsr.w r1, lr, #31 -; CHECK-LV-NEXT: bfi r2, r1, #31, #1 -; CHECK-LV-NEXT: vmov d1, r0, r3 -; CHECK-LV-NEXT: vmov d0, r12, r2 -; CHECK-LV-NEXT: pop {r7, pc} -; -; CHECK-LIS-LABEL: copysign_float64_t: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .save {r4, lr} -; CHECK-LIS-NEXT: push {r4, lr} -; CHECK-LIS-NEXT: vmov r0, r12, d3 -; CHECK-LIS-NEXT: vmov r0, lr, d2 -; CHECK-LIS-NEXT: vmov r4, r3, d1 -; CHECK-LIS-NEXT: vmov r1, r2, d0 -; CHECK-LIS-NEXT: lsr.w r0, r12, #31 -; CHECK-LIS-NEXT: bfi r3, r0, #31, #1 -; CHECK-LIS-NEXT: lsr.w r0, lr, #31 -; CHECK-LIS-NEXT: bfi r2, r0, #31, #1 -; CHECK-LIS-NEXT: vmov d1, r4, r3 -; CHECK-LIS-NEXT: vmov d0, r1, r2 -; CHECK-LIS-NEXT: pop {r4, pc} +; CHECK-LABEL: copysign_float64_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmov r0, lr, d2 +; CHECK-NEXT: vmov r0, r3, d1 +; CHECK-NEXT: vmov r12, r2, d0 +; CHECK-NEXT: lsrs r1, r1, #31 +; CHECK-NEXT: bfi r3, r1, #31, #1 +; CHECK-NEXT: lsr.w r1, lr, #31 +; CHECK-NEXT: bfi r2, r1, #31, #1 +; CHECK-NEXT: vmov d1, r0, r3 +; CHECK-NEXT: vmov d0, r12, r2 +; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <2 x double> @llvm.copysign.v2f64(<2 x double> %src1, <2 x double> %src2) ret <2 x double> %0 @@ -1153,4 +1137,3 @@ declare <2 x double> @llvm.log2.v2f64(<2 x double>) declare <2 x double> @llvm.log10.v2f64(<2 x double>) declare <2 x double> @llvm.pow.v2f64(<2 x double>, <2 x double>) declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) - diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll index 6e644c58687fa..f4643f8c6c4a1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -223,31 +223,18 @@ entry: } define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) { -; CHECK-LV-LABEL: shuffle3_i16: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: vmov q1, q0 -; CHECK-LV-NEXT: vmovx.f16 s2, s5 -; CHECK-LV-NEXT: vmovx.f16 s0, s4 -; CHECK-LV-NEXT: vins.f16 s5, s4 -; CHECK-LV-NEXT: vins.f16 s2, s0 -; CHECK-LV-NEXT: vmov.f32 s3, s5 -; CHECK-LV-NEXT: vmovx.f16 s1, s7 -; CHECK-LV-NEXT: vmov.f32 s0, s6 -; CHECK-LV-NEXT: vins.f16 s1, s7 -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: shuffle3_i16: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: vmovx.f16 s5, s3 -; CHECK-LIS-NEXT: vmovx.f16 s6, s1 -; CHECK-LIS-NEXT: vmovx.f16 s4, s0 -; CHECK-LIS-NEXT: vins.f16 s1, s0 -; CHECK-LIS-NEXT: vins.f16 s6, s4 -; CHECK-LIS-NEXT: vins.f16 s5, s3 -; CHECK-LIS-NEXT: vmov.f32 s7, s1 -; CHECK-LIS-NEXT: vmov.f32 s4, s2 -; CHECK-LIS-NEXT: vmov q0, q1 -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: shuffle3_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vins.f16 s5, s4 +; CHECK-NEXT: vins.f16 s2, s0 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmovx.f16 s1, s7 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vins.f16 s1, s7 +; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> ret <8 x i16> %out @@ -491,79 +478,42 @@ entry: } define arm_aapcs_vfpcc <16 x i8> @shuffle3_i8(<16 x i8> %src) { -; CHECK-LV-LABEL: shuffle3_i8: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: vmov q1, q0 -; CHECK-LV-NEXT: vmov.u8 r0, q0[4] -; CHECK-LV-NEXT: vmov.8 q0[0], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[5] -; CHECK-LV-NEXT: vmov.8 q0[1], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[15] -; CHECK-LV-NEXT: vmov.8 q0[2], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[7] -; CHECK-LV-NEXT: vmov.8 q0[3], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[14] -; CHECK-LV-NEXT: vmov.8 q0[4], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[9] -; CHECK-LV-NEXT: vmov.8 q0[5], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[6] -; CHECK-LV-NEXT: vmov.8 q0[6], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[3] -; CHECK-LV-NEXT: vmov.8 q0[7], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[10] -; CHECK-LV-NEXT: vmov.8 q0[8], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[12] -; CHECK-LV-NEXT: vmov.8 q0[9], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[1] -; CHECK-LV-NEXT: vmov.8 q0[10], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[13] -; CHECK-LV-NEXT: vmov.8 q0[11], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[2] -; CHECK-LV-NEXT: vmov.8 q0[12], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[8] -; CHECK-LV-NEXT: vmov.8 q0[13], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[0] -; CHECK-LV-NEXT: vmov.8 q0[14], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[11] -; CHECK-LV-NEXT: vmov.8 q0[15], r0 -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: shuffle3_i8: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: vmov.u8 r0, q0[4] -; CHECK-LIS-NEXT: vmov.8 q1[0], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[5] -; CHECK-LIS-NEXT: vmov.8 q1[1], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[15] -; CHECK-LIS-NEXT: vmov.8 q1[2], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[7] -; CHECK-LIS-NEXT: vmov.8 q1[3], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[14] -; CHECK-LIS-NEXT: vmov.8 q1[4], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[9] -; CHECK-LIS-NEXT: vmov.8 q1[5], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[6] -; CHECK-LIS-NEXT: vmov.8 q1[6], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[3] -; CHECK-LIS-NEXT: vmov.8 q1[7], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[10] -; CHECK-LIS-NEXT: vmov.8 q1[8], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[12] -; CHECK-LIS-NEXT: vmov.8 q1[9], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[1] -; CHECK-LIS-NEXT: vmov.8 q1[10], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[13] -; CHECK-LIS-NEXT: vmov.8 q1[11], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[2] -; CHECK-LIS-NEXT: vmov.8 q1[12], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[8] -; CHECK-LIS-NEXT: vmov.8 q1[13], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[0] -; CHECK-LIS-NEXT: vmov.8 q1[14], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[11] -; CHECK-LIS-NEXT: vmov.8 q1[15], r0 -; CHECK-LIS-NEXT: vmov q0, q1 -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: shuffle3_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[5] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov.u8 r0, q1[15] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov.u8 r0, q1[7] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov.u8 r0, q1[6] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.u8 r0, q1[12] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.u8 r0, q1[1] +; CHECK-NEXT: vmov.8 q0[10], r0 +; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.8 q0[11], r0 +; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: vmov.8 q0[12], r0 +; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: vmov.8 q0[13], r0 +; CHECK-NEXT: vmov.u8 r0, q1[0] +; CHECK-NEXT: vmov.8 q0[14], r0 +; CHECK-NEXT: vmov.u8 r0, q1[11] +; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: bx lr entry: %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> ret <16 x i8> %out @@ -1195,31 +1145,18 @@ entry: } define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) { -; CHECK-LV-LABEL: shuffle3_f16: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: vmov q1, q0 -; CHECK-LV-NEXT: vmovx.f16 s2, s5 -; CHECK-LV-NEXT: vmovx.f16 s0, s4 -; CHECK-LV-NEXT: vins.f16 s5, s4 -; CHECK-LV-NEXT: vins.f16 s2, s0 -; CHECK-LV-NEXT: vmov.f32 s3, s5 -; CHECK-LV-NEXT: vmovx.f16 s1, s7 -; CHECK-LV-NEXT: vmov.f32 s0, s6 -; CHECK-LV-NEXT: vins.f16 s1, s7 -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: shuffle3_f16: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: vmovx.f16 s5, s3 -; CHECK-LIS-NEXT: vmovx.f16 s6, s1 -; CHECK-LIS-NEXT: vmovx.f16 s4, s0 -; CHECK-LIS-NEXT: vins.f16 s1, s0 -; CHECK-LIS-NEXT: vins.f16 s6, s4 -; CHECK-LIS-NEXT: vins.f16 s5, s3 -; CHECK-LIS-NEXT: vmov.f32 s7, s1 -; CHECK-LIS-NEXT: vmov.f32 s4, s2 -; CHECK-LIS-NEXT: vmov q0, q1 -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: shuffle3_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vins.f16 s5, s4 +; CHECK-NEXT: vins.f16 s2, s0 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmovx.f16 s1, s7 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vins.f16 s1, s7 +; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> ret <8 x half> %out @@ -1530,27 +1467,47 @@ entry: ret <2 x double> %out } define arm_aapcs_vfpcc <8 x double> @shuffle9_f64(<4 x double> %src1, <4 x double> %src2) { -; CHECK-LABEL: shuffle9_f64: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov q5, q2 -; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: vmov.f32 s18, s20 -; CHECK-NEXT: vmov.f32 s20, s2 -; CHECK-NEXT: vmov.f32 s10, s12 -; CHECK-NEXT: vmov.f32 s19, s21 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s17, s1 -; CHECK-NEXT: vmov.f32 s21, s3 -; CHECK-NEXT: vmov q0, q4 -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s11, s13 -; CHECK-NEXT: vmov.f32 s9, s5 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov q1, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: shuffle9_f64: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LV-NEXT: vmov q5, q2 +; CHECK-LV-NEXT: vmov.f32 s16, s0 +; CHECK-LV-NEXT: vmov.f32 s18, s20 +; CHECK-LV-NEXT: vmov.f32 s20, s2 +; CHECK-LV-NEXT: vmov.f32 s10, s12 +; CHECK-LV-NEXT: vmov.f32 s19, s21 +; CHECK-LV-NEXT: vmov.f32 s8, s4 +; CHECK-LV-NEXT: vmov.f32 s17, s1 +; CHECK-LV-NEXT: vmov.f32 s21, s3 +; CHECK-LV-NEXT: vmov q0, q4 +; CHECK-LV-NEXT: vmov.f32 s12, s6 +; CHECK-LV-NEXT: vmov.f32 s11, s13 +; CHECK-LV-NEXT: vmov.f32 s9, s5 +; CHECK-LV-NEXT: vmov.f32 s13, s7 +; CHECK-LV-NEXT: vmov q1, q5 +; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LV-NEXT: bx lr +; +; CHECK-LIS-LABEL: shuffle9_f64: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vmov q5, q2 +; CHECK-LIS-NEXT: vmov q4, q0 +; CHECK-LIS-NEXT: vmov.f32 s2, s20 +; CHECK-LIS-NEXT: vmov.f32 s20, s18 +; CHECK-LIS-NEXT: vmov.f32 s10, s12 +; CHECK-LIS-NEXT: vmov.f32 s3, s21 +; CHECK-LIS-NEXT: vmov.f32 s8, s4 +; CHECK-LIS-NEXT: vmov.f32 s21, s19 +; CHECK-LIS-NEXT: vmov.f32 s12, s6 +; CHECK-LIS-NEXT: vmov.f32 s11, s13 +; CHECK-LIS-NEXT: vmov.f32 s9, s5 +; CHECK-LIS-NEXT: vmov.f32 s13, s7 +; CHECK-LIS-NEXT: vmov q1, q5 +; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LIS-NEXT: bx lr entry: %out = shufflevector <4 x double> %src1, <4 x double> %src2, <8 x i32> ret <8 x double> %out @@ -1623,27 +1580,47 @@ entry: ret <2 x i64> %out } define arm_aapcs_vfpcc <8 x i64> @shuffle9_i64(<4 x i64> %src1, <4 x i64> %src2) { -; CHECK-LABEL: shuffle9_i64: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov q5, q2 -; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: vmov.f32 s18, s20 -; CHECK-NEXT: vmov.f32 s20, s2 -; CHECK-NEXT: vmov.f32 s10, s12 -; CHECK-NEXT: vmov.f32 s19, s21 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s17, s1 -; CHECK-NEXT: vmov.f32 s21, s3 -; CHECK-NEXT: vmov q0, q4 -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s11, s13 -; CHECK-NEXT: vmov.f32 s9, s5 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov q1, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: shuffle9_i64: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LV-NEXT: vmov q5, q2 +; CHECK-LV-NEXT: vmov.f32 s16, s0 +; CHECK-LV-NEXT: vmov.f32 s18, s20 +; CHECK-LV-NEXT: vmov.f32 s20, s2 +; CHECK-LV-NEXT: vmov.f32 s10, s12 +; CHECK-LV-NEXT: vmov.f32 s19, s21 +; CHECK-LV-NEXT: vmov.f32 s8, s4 +; CHECK-LV-NEXT: vmov.f32 s17, s1 +; CHECK-LV-NEXT: vmov.f32 s21, s3 +; CHECK-LV-NEXT: vmov q0, q4 +; CHECK-LV-NEXT: vmov.f32 s12, s6 +; CHECK-LV-NEXT: vmov.f32 s11, s13 +; CHECK-LV-NEXT: vmov.f32 s9, s5 +; CHECK-LV-NEXT: vmov.f32 s13, s7 +; CHECK-LV-NEXT: vmov q1, q5 +; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LV-NEXT: bx lr +; +; CHECK-LIS-LABEL: shuffle9_i64: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vmov q5, q2 +; CHECK-LIS-NEXT: vmov q4, q0 +; CHECK-LIS-NEXT: vmov.f32 s2, s20 +; CHECK-LIS-NEXT: vmov.f32 s20, s18 +; CHECK-LIS-NEXT: vmov.f32 s10, s12 +; CHECK-LIS-NEXT: vmov.f32 s3, s21 +; CHECK-LIS-NEXT: vmov.f32 s8, s4 +; CHECK-LIS-NEXT: vmov.f32 s21, s19 +; CHECK-LIS-NEXT: vmov.f32 s12, s6 +; CHECK-LIS-NEXT: vmov.f32 s11, s13 +; CHECK-LIS-NEXT: vmov.f32 s9, s5 +; CHECK-LIS-NEXT: vmov.f32 s13, s7 +; CHECK-LIS-NEXT: vmov q1, q5 +; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LIS-NEXT: bx lr entry: %out = shufflevector <4 x i64> %src1, <4 x i64> %src2, <8 x i32> ret <8 x i64> %out diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll index 8a94e571e9836..ccdc996d75970 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -68,87 +68,46 @@ entry: } define void @vld3_v8i32(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v8i32: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} -; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-LV-NEXT: vmov.f32 s10, s2 -; CHECK-LV-NEXT: vmov.f32 s13, s0 -; CHECK-LV-NEXT: vmov.f32 s14, s3 -; CHECK-LV-NEXT: vmov.f32 s8, s4 -; CHECK-LV-NEXT: vmov.f32 s9, s7 -; CHECK-LV-NEXT: vmov.f32 s12, s5 -; CHECK-LV-NEXT: vmov.f32 s15, s18 -; CHECK-LV-NEXT: vmov.f32 s11, s17 -; CHECK-LV-NEXT: vadd.i32 q2, q2, q3 -; CHECK-LV-NEXT: vmov.f32 s0, s6 -; CHECK-LV-NEXT: vmov.f32 s2, s16 -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LV-NEXT: vmov.f32 s3, s19 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-LV-NEXT: vadd.i32 q0, q2, q0 -; CHECK-LV-NEXT: vldrw.u32 q2, [r0] -; CHECK-LV-NEXT: vmov.f32 s17, s4 -; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LV-NEXT: vmov.f32 s18, s7 -; CHECK-LV-NEXT: vmov.f32 s22, s6 -; CHECK-LV-NEXT: vmov.f32 s16, s9 -; CHECK-LV-NEXT: vmov.f32 s19, s14 -; CHECK-LV-NEXT: vmov.f32 s20, s8 -; CHECK-LV-NEXT: vmov.f32 s21, s11 -; CHECK-LV-NEXT: vmov.f32 s23, s13 -; CHECK-LV-NEXT: vadd.i32 q4, q5, q4 -; CHECK-LV-NEXT: vmov.f32 s4, s10 -; CHECK-LV-NEXT: vmov.f32 s6, s12 -; CHECK-LV-NEXT: vmov.f32 s7, s15 -; CHECK-LV-NEXT: vadd.i32 q1, q4, q1 -; CHECK-LV-NEXT: vstrw.32 q1, [r1] -; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: vld3_v8i32: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-LIS-NEXT: vmov.f32 s10, s2 -; CHECK-LIS-NEXT: vmov.f32 s13, s0 -; CHECK-LIS-NEXT: vmov.f32 s14, s3 -; CHECK-LIS-NEXT: vmov.f32 s8, s4 -; CHECK-LIS-NEXT: vmov.f32 s9, s7 -; CHECK-LIS-NEXT: vmov.f32 s12, s5 -; CHECK-LIS-NEXT: vmov.f32 s15, s18 -; CHECK-LIS-NEXT: vmov.f32 s11, s17 -; CHECK-LIS-NEXT: vmov.f32 s0, s6 -; CHECK-LIS-NEXT: vadd.i32 q2, q2, q3 -; CHECK-LIS-NEXT: vmov.f32 s2, s16 -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LIS-NEXT: vmov.f32 s3, s19 -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-LIS-NEXT: vadd.i32 q0, q2, q0 -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0] -; CHECK-LIS-NEXT: vmov.f32 s13, s4 -; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LIS-NEXT: vmov.f32 s14, s7 -; CHECK-LIS-NEXT: vmov.f32 s22, s6 -; CHECK-LIS-NEXT: vmov.f32 s12, s9 -; CHECK-LIS-NEXT: vmov.f32 s15, s18 -; CHECK-LIS-NEXT: vmov.f32 s20, s8 -; CHECK-LIS-NEXT: vmov.f32 s21, s11 -; CHECK-LIS-NEXT: vmov.f32 s23, s17 -; CHECK-LIS-NEXT: vadd.i32 q3, q5, q3 -; CHECK-LIS-NEXT: vmov.f32 s4, s10 -; CHECK-LIS-NEXT: vmov.f32 s6, s16 -; CHECK-LIS-NEXT: vmov.f32 s7, s19 -; CHECK-LIS-NEXT: vadd.i32 q1, q3, q1 -; CHECK-LIS-NEXT: vstrw.32 q1, [r1] -; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: vld3_v8i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 +; CHECK-NEXT: vmov.f32 s15, s18 +; CHECK-NEXT: vmov.f32 s11, s17 +; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vadd.i32 q0, q2, q0 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.f32 s23, s13 +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s6, s12 +; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vadd.i32 q1, q4, q1 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr entry: %l1 = load <24 x i32>, ptr %src, align 4 %s1 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> @@ -161,155 +120,80 @@ entry: } define void @vld3_v16i32(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v16i32: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LV-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-LV-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-LV-NEXT: vmov.f32 s10, s2 -; CHECK-LV-NEXT: vmov.f32 s13, s0 -; CHECK-LV-NEXT: vmov.f32 s14, s3 -; CHECK-LV-NEXT: vmov.f32 s8, s4 -; CHECK-LV-NEXT: vmov.f32 s9, s7 -; CHECK-LV-NEXT: vmov.f32 s12, s5 -; CHECK-LV-NEXT: vmov.f32 s15, s18 -; CHECK-LV-NEXT: vmov.f32 s11, s17 -; CHECK-LV-NEXT: vadd.i32 q2, q2, q3 -; CHECK-LV-NEXT: vmov.f32 s0, s6 -; CHECK-LV-NEXT: vmov.f32 s2, s16 -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LV-NEXT: vmov.f32 s3, s19 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-LV-NEXT: vadd.i32 q0, q2, q0 -; CHECK-LV-NEXT: vldrw.u32 q2, [r0] -; CHECK-LV-NEXT: vmov.f32 s17, s4 -; CHECK-LV-NEXT: vmov.f32 s18, s7 -; CHECK-LV-NEXT: vmov.f32 s22, s6 -; CHECK-LV-NEXT: vmov.f32 s16, s9 -; CHECK-LV-NEXT: vmov.f32 s19, s14 -; CHECK-LV-NEXT: vmov.f32 s20, s8 -; CHECK-LV-NEXT: vmov.f32 s21, s11 -; CHECK-LV-NEXT: vmov.f32 s23, s13 -; CHECK-LV-NEXT: vmov.f32 s4, s10 -; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-LV-NEXT: vmov.f32 s6, s12 -; CHECK-LV-NEXT: vadd.i32 q4, q5, q4 -; CHECK-LV-NEXT: vmov.f32 s7, s15 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-LV-NEXT: vadd.i32 q1, q4, q1 -; CHECK-LV-NEXT: vmov.f32 s18, s10 -; CHECK-LV-NEXT: vmov.f32 s21, s8 -; CHECK-LV-NEXT: vmov.f32 s22, s11 -; CHECK-LV-NEXT: vmov.f32 s16, s12 -; CHECK-LV-NEXT: vmov.f32 s17, s15 -; CHECK-LV-NEXT: vmov.f32 s20, s13 -; CHECK-LV-NEXT: vmov.f32 s23, s26 -; CHECK-LV-NEXT: vmov.f32 s19, s25 -; CHECK-LV-NEXT: vadd.i32 q4, q4, q5 -; CHECK-LV-NEXT: vmov.f32 s8, s14 -; CHECK-LV-NEXT: vmov.f32 s10, s24 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-LV-NEXT: vmov.f32 s11, s27 -; CHECK-LV-NEXT: vldrw.u32 q5, [r0, #128] -; CHECK-LV-NEXT: vadd.i32 q2, q4, q2 -; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-LV-NEXT: vmov.f32 s25, s12 -; CHECK-LV-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-LV-NEXT: vmov.f32 s26, s15 -; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LV-NEXT: vmov.f32 s30, s14 -; CHECK-LV-NEXT: vstrw.32 q1, [r1] -; CHECK-LV-NEXT: vmov.f32 s24, s17 -; CHECK-LV-NEXT: vmov.f32 s27, s22 -; CHECK-LV-NEXT: vmov.f32 s28, s16 -; CHECK-LV-NEXT: vmov.f32 s29, s19 -; CHECK-LV-NEXT: vmov.f32 s31, s21 -; CHECK-LV-NEXT: vadd.i32 q6, q7, q6 -; CHECK-LV-NEXT: vmov.f32 s12, s18 -; CHECK-LV-NEXT: vmov.f32 s14, s20 -; CHECK-LV-NEXT: vmov.f32 s15, s23 -; CHECK-LV-NEXT: vadd.i32 q3, q6, q3 -; CHECK-LV-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-LV-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: vld3_v16i32: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-LIS-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-LIS-NEXT: vmov.f32 s10, s2 -; CHECK-LIS-NEXT: vmov.f32 s13, s0 -; CHECK-LIS-NEXT: vmov.f32 s14, s3 -; CHECK-LIS-NEXT: vmov.f32 s8, s4 -; CHECK-LIS-NEXT: vmov.f32 s9, s7 -; CHECK-LIS-NEXT: vmov.f32 s12, s5 -; CHECK-LIS-NEXT: vmov.f32 s15, s18 -; CHECK-LIS-NEXT: vmov.f32 s11, s17 -; CHECK-LIS-NEXT: vmov.f32 s0, s6 -; CHECK-LIS-NEXT: vadd.i32 q2, q2, q3 -; CHECK-LIS-NEXT: vmov.f32 s2, s16 -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LIS-NEXT: vmov.f32 s3, s19 -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-LIS-NEXT: vadd.i32 q0, q2, q0 -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0] -; CHECK-LIS-NEXT: vmov.f32 s13, s4 -; CHECK-LIS-NEXT: vmov.f32 s14, s7 -; CHECK-LIS-NEXT: vmov.f32 s22, s6 -; CHECK-LIS-NEXT: vmov.f32 s12, s9 -; CHECK-LIS-NEXT: vmov.f32 s15, s18 -; CHECK-LIS-NEXT: vmov.f32 s20, s8 -; CHECK-LIS-NEXT: vmov.f32 s21, s11 -; CHECK-LIS-NEXT: vmov.f32 s23, s17 -; CHECK-LIS-NEXT: vadd.i32 q3, q5, q3 -; CHECK-LIS-NEXT: vmov.f32 s4, s10 -; CHECK-LIS-NEXT: vmov.f32 s7, s19 -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-LIS-NEXT: vmov.f32 s6, s16 -; CHECK-LIS-NEXT: vadd.i32 q1, q3, q1 -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-LIS-NEXT: vmov.f32 s18, s10 -; CHECK-LIS-NEXT: vmov.f32 s21, s8 -; CHECK-LIS-NEXT: vmov.f32 s22, s11 -; CHECK-LIS-NEXT: vmov.f32 s16, s12 -; CHECK-LIS-NEXT: vmov.f32 s17, s15 -; CHECK-LIS-NEXT: vmov.f32 s20, s13 -; CHECK-LIS-NEXT: vmov.f32 s23, s26 -; CHECK-LIS-NEXT: vmov.f32 s19, s25 -; CHECK-LIS-NEXT: vmov.f32 s8, s14 -; CHECK-LIS-NEXT: vadd.i32 q4, q4, q5 -; CHECK-LIS-NEXT: vmov.f32 s10, s24 -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-LIS-NEXT: vmov.f32 s11, s27 -; CHECK-LIS-NEXT: vldrw.u32 q6, [r0, #128] -; CHECK-LIS-NEXT: vadd.i32 q2, q4, q2 -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-LIS-NEXT: vmov.f32 s21, s12 -; CHECK-LIS-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-LIS-NEXT: vmov.f32 s22, s15 -; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LIS-NEXT: vmov.f32 s30, s14 -; CHECK-LIS-NEXT: vstrw.32 q1, [r1] -; CHECK-LIS-NEXT: vmov.f32 s20, s17 -; CHECK-LIS-NEXT: vmov.f32 s23, s26 -; CHECK-LIS-NEXT: vmov.f32 s28, s16 -; CHECK-LIS-NEXT: vmov.f32 s29, s19 -; CHECK-LIS-NEXT: vmov.f32 s31, s25 -; CHECK-LIS-NEXT: vadd.i32 q5, q7, q5 -; CHECK-LIS-NEXT: vmov.f32 s12, s18 -; CHECK-LIS-NEXT: vmov.f32 s14, s24 -; CHECK-LIS-NEXT: vmov.f32 s15, s27 -; CHECK-LIS-NEXT: vadd.i32 q3, q5, q3 -; CHECK-LIS-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: vld3_v16i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-NEXT: vldrw.u32 q6, [r0, #176] +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 +; CHECK-NEXT: vmov.f32 s15, s18 +; CHECK-NEXT: vmov.f32 s11, s17 +; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vadd.i32 q0, q2, q0 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.f32 s23, s13 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-NEXT: vmov.f32 s6, s12 +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vldrw.u32 q3, [r0, #144] +; CHECK-NEXT: vadd.i32 q1, q4, q1 +; CHECK-NEXT: vmov.f32 s18, s10 +; CHECK-NEXT: vmov.f32 s21, s8 +; CHECK-NEXT: vmov.f32 s22, s11 +; CHECK-NEXT: vmov.f32 s16, s12 +; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vmov.f32 s20, s13 +; CHECK-NEXT: vmov.f32 s23, s26 +; CHECK-NEXT: vmov.f32 s19, s25 +; CHECK-NEXT: vadd.i32 q4, q4, q5 +; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: vmov.f32 s10, s24 +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vmov.f32 s11, s27 +; CHECK-NEXT: vldrw.u32 q5, [r0, #128] +; CHECK-NEXT: vadd.i32 q2, q4, q2 +; CHECK-NEXT: vldrw.u32 q4, [r0, #96] +; CHECK-NEXT: vmov.f32 s25, s12 +; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.f32 s30, s14 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmov.f32 s24, s17 +; CHECK-NEXT: vmov.f32 s27, s22 +; CHECK-NEXT: vmov.f32 s28, s16 +; CHECK-NEXT: vmov.f32 s29, s19 +; CHECK-NEXT: vmov.f32 s31, s21 +; CHECK-NEXT: vadd.i32 q6, q7, q6 +; CHECK-NEXT: vmov.f32 s12, s18 +; CHECK-NEXT: vmov.f32 s14, s20 +; CHECK-NEXT: vmov.f32 s15, s23 +; CHECK-NEXT: vadd.i32 q3, q6, q3 +; CHECK-NEXT: vstrw.32 q3, [r1, #32] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr entry: %l1 = load <48 x i32>, ptr %src, align 4 %s1 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> @@ -364,59 +248,32 @@ entry: } define void @vld3_v4i16(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v4i16: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .save {r4, r5, r6, lr} -; CHECK-LV-NEXT: push {r4, r5, r6, lr} -; CHECK-LV-NEXT: vldrw.u32 q0, [r0] -; CHECK-LV-NEXT: vldrh.u32 q1, [r0, #16] -; CHECK-LV-NEXT: vmov.u16 r5, q0[6] -; CHECK-LV-NEXT: vmov.u16 r6, q0[0] -; CHECK-LV-NEXT: vmov r0, r3, d2 -; CHECK-LV-NEXT: vmov.u16 lr, q0[2] -; CHECK-LV-NEXT: vmov r2, r4, d3 -; CHECK-LV-NEXT: vmov q1[2], q1[0], r6, r5 -; CHECK-LV-NEXT: vmov.u16 r5, q0[7] -; CHECK-LV-NEXT: vmov.u16 r6, q0[1] -; CHECK-LV-NEXT: vmov q2[2], q2[0], r6, r5 -; CHECK-LV-NEXT: vmov.u16 r5, q0[3] -; CHECK-LV-NEXT: vmov.u16 r6, q0[4] -; CHECK-LV-NEXT: vmov q1[3], q1[1], r5, r3 -; CHECK-LV-NEXT: vmov q2[3], q2[1], r6, r2 -; CHECK-LV-NEXT: vmov.u16 r12, q0[5] -; CHECK-LV-NEXT: vadd.i32 q0, q1, q2 -; CHECK-LV-NEXT: vmov q1[2], q1[0], lr, r0 -; CHECK-LV-NEXT: vmov q1[3], q1[1], r12, r4 -; CHECK-LV-NEXT: vadd.i32 q0, q0, q1 -; CHECK-LV-NEXT: vstrh.32 q0, [r1] -; CHECK-LV-NEXT: pop {r4, r5, r6, pc} -; -; CHECK-LIS-LABEL: vld3_v4i16: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .save {r4, r5, r6, lr} -; CHECK-LIS-NEXT: push {r4, r5, r6, lr} -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] -; CHECK-LIS-NEXT: vldrh.u32 q1, [r0, #16] -; CHECK-LIS-NEXT: vmov.u16 r5, q0[6] -; CHECK-LIS-NEXT: vmov.u16 r6, q0[0] -; CHECK-LIS-NEXT: vmov r0, r2, d2 -; CHECK-LIS-NEXT: vmov.u16 r12, q0[2] -; CHECK-LIS-NEXT: vmov r3, r4, d3 -; CHECK-LIS-NEXT: vmov q1[2], q1[0], r6, r5 -; CHECK-LIS-NEXT: vmov.u16 r5, q0[7] -; CHECK-LIS-NEXT: vmov.u16 r6, q0[1] -; CHECK-LIS-NEXT: vmov q2[2], q2[0], r6, r5 -; CHECK-LIS-NEXT: vmov.u16 r5, q0[3] -; CHECK-LIS-NEXT: vmov.u16 r6, q0[4] -; CHECK-LIS-NEXT: vmov q1[3], q1[1], r5, r2 -; CHECK-LIS-NEXT: vmov q2[3], q2[1], r6, r3 -; CHECK-LIS-NEXT: vmov.u16 lr, q0[5] -; CHECK-LIS-NEXT: vadd.i32 q0, q1, q2 -; CHECK-LIS-NEXT: vmov q1[2], q1[0], r12, r0 -; CHECK-LIS-NEXT: vmov q1[3], q1[1], lr, r4 -; CHECK-LIS-NEXT: vadd.i32 q0, q0, q1 -; CHECK-LIS-NEXT: vstrh.32 q0, [r1] -; CHECK-LIS-NEXT: pop {r4, r5, r6, pc} +; CHECK-LABEL: vld3_v4i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrh.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.u16 r5, q0[6] +; CHECK-NEXT: vmov.u16 r6, q0[0] +; CHECK-NEXT: vmov r0, r3, d2 +; CHECK-NEXT: vmov.u16 lr, q0[2] +; CHECK-NEXT: vmov r2, r4, d3 +; CHECK-NEXT: vmov q1[2], q1[0], r6, r5 +; CHECK-NEXT: vmov.u16 r5, q0[7] +; CHECK-NEXT: vmov.u16 r6, q0[1] +; CHECK-NEXT: vmov q2[2], q2[0], r6, r5 +; CHECK-NEXT: vmov.u16 r5, q0[3] +; CHECK-NEXT: vmov.u16 r6, q0[4] +; CHECK-NEXT: vmov q1[3], q1[1], r5, r3 +; CHECK-NEXT: vmov q2[3], q2[1], r6, r2 +; CHECK-NEXT: vmov.u16 r12, q0[5] +; CHECK-NEXT: vadd.i32 q0, q1, q2 +; CHECK-NEXT: vmov q1[2], q1[0], lr, r0 +; CHECK-NEXT: vmov q1[3], q1[1], r12, r4 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vstrh.32 q0, [r1] +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %l1 = load <12 x i16>, ptr %src, align 4 %s1 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> @@ -603,44 +460,44 @@ define void @vld3_v16i16(ptr %src, ptr %dst) { ; CHECK-LIS-NEXT: vmov.f32 s3, s13 ; CHECK-LIS-NEXT: vins.f16 s17, s9 ; CHECK-LIS-NEXT: vmov.f32 s2, s10 -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-LIS-NEXT: vadd.i16 q0, q0, q4 -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0] +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #16] ; CHECK-LIS-NEXT: vadd.i16 q0, q0, q1 -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-LIS-NEXT: vmovx.f16 s10, s14 -; CHECK-LIS-NEXT: vmov.f32 s22, s15 -; CHECK-LIS-NEXT: vins.f16 s10, s4 -; CHECK-LIS-NEXT: vmovx.f16 s4, s4 -; CHECK-LIS-NEXT: vins.f16 s22, s4 -; CHECK-LIS-NEXT: vmovx.f16 s4, s7 -; CHECK-LIS-NEXT: vmov.f32 s23, s6 -; CHECK-LIS-NEXT: vmovx.f16 s8, s16 -; CHECK-LIS-NEXT: vins.f16 s23, s4 -; CHECK-LIS-NEXT: vmovx.f16 s4, s17 -; CHECK-LIS-NEXT: vins.f16 s16, s4 +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0] +; CHECK-LIS-NEXT: vmovx.f16 s6, s18 +; CHECK-LIS-NEXT: vmov.f32 s22, s19 +; CHECK-LIS-NEXT: vins.f16 s6, s8 +; CHECK-LIS-NEXT: vmovx.f16 s8, s8 +; CHECK-LIS-NEXT: vins.f16 s22, s8 +; CHECK-LIS-NEXT: vmovx.f16 s8, s11 +; CHECK-LIS-NEXT: vmov.f32 s23, s10 ; CHECK-LIS-NEXT: vmovx.f16 s4, s12 -; CHECK-LIS-NEXT: vmovx.f16 s9, s19 -; CHECK-LIS-NEXT: vins.f16 s19, s4 -; CHECK-LIS-NEXT: vmovx.f16 s4, s15 -; CHECK-LIS-NEXT: vmovx.f16 s11, s5 -; CHECK-LIS-NEXT: vins.f16 s14, s4 -; CHECK-LIS-NEXT: vmovx.f16 s4, s6 -; CHECK-LIS-NEXT: vins.f16 s8, s18 -; CHECK-LIS-NEXT: vmov.f32 s20, s17 -; CHECK-LIS-NEXT: vmovx.f16 s18, s18 -; CHECK-LIS-NEXT: vins.f16 s5, s4 -; CHECK-LIS-NEXT: vins.f16 s9, s13 -; CHECK-LIS-NEXT: vins.f16 s20, s18 -; CHECK-LIS-NEXT: vmov.f32 s17, s19 -; CHECK-LIS-NEXT: vins.f16 s11, s7 -; CHECK-LIS-NEXT: vmovx.f16 s13, s13 -; CHECK-LIS-NEXT: vmov.f32 s21, s12 -; CHECK-LIS-NEXT: vmov.f32 s18, s14 -; CHECK-LIS-NEXT: vins.f16 s21, s13 -; CHECK-LIS-NEXT: vmov.f32 s19, s5 +; CHECK-LIS-NEXT: vins.f16 s23, s8 +; CHECK-LIS-NEXT: vmovx.f16 s8, s13 +; CHECK-LIS-NEXT: vins.f16 s12, s8 +; CHECK-LIS-NEXT: vmovx.f16 s8, s16 +; CHECK-LIS-NEXT: vmovx.f16 s5, s15 +; CHECK-LIS-NEXT: vins.f16 s15, s8 +; CHECK-LIS-NEXT: vmovx.f16 s8, s19 +; CHECK-LIS-NEXT: vins.f16 s4, s14 +; CHECK-LIS-NEXT: vmov.f32 s20, s13 +; CHECK-LIS-NEXT: vmovx.f16 s14, s14 +; CHECK-LIS-NEXT: vins.f16 s18, s8 +; CHECK-LIS-NEXT: vmovx.f16 s8, s10 +; CHECK-LIS-NEXT: vmovx.f16 s7, s9 +; CHECK-LIS-NEXT: vins.f16 s20, s14 +; CHECK-LIS-NEXT: vmovx.f16 s14, s17 +; CHECK-LIS-NEXT: vmov.f32 s21, s16 +; CHECK-LIS-NEXT: vins.f16 s9, s8 +; CHECK-LIS-NEXT: vins.f16 s21, s14 +; CHECK-LIS-NEXT: vmov.f32 s13, s15 +; CHECK-LIS-NEXT: vins.f16 s7, s11 +; CHECK-LIS-NEXT: vins.f16 s5, s17 +; CHECK-LIS-NEXT: vmov.f32 s14, s18 +; CHECK-LIS-NEXT: vmov.f32 s15, s9 ; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LIS-NEXT: vadd.i16 q1, q4, q2 +; CHECK-LIS-NEXT: vadd.i16 q1, q3, q1 ; CHECK-LIS-NEXT: vadd.i16 q1, q1, q5 ; CHECK-LIS-NEXT: vstrw.32 q1, [r1] ; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} @@ -935,65 +792,35 @@ entry: ; i64 define void @vld3_v2i64(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v2i64: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-LV-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-LV-NEXT: vldrw.u32 q0, [r0] -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-LV-NEXT: vmov.f32 s12, s2 -; CHECK-LV-NEXT: vmov.f32 s13, s3 -; CHECK-LV-NEXT: vmov.f32 s2, s4 -; CHECK-LV-NEXT: vmov.f32 s3, s5 -; CHECK-LV-NEXT: vmov r0, r3, d5 -; CHECK-LV-NEXT: vmov r2, r4, d3 -; CHECK-LV-NEXT: vmov r6, r7, d0 -; CHECK-LV-NEXT: vmov r5, r8, d6 -; CHECK-LV-NEXT: vmov lr, r12, d1 -; CHECK-LV-NEXT: adds.w r0, r0, lr -; CHECK-LV-NEXT: adc.w r3, r3, r12 -; CHECK-LV-NEXT: adds r0, r0, r2 -; CHECK-LV-NEXT: adc.w r2, r3, r4 -; CHECK-LV-NEXT: vmov r3, r4, d4 -; CHECK-LV-NEXT: adds r6, r6, r5 -; CHECK-LV-NEXT: adc.w r7, r7, r8 -; CHECK-LV-NEXT: adds r3, r3, r6 -; CHECK-LV-NEXT: adcs r7, r4 -; CHECK-LV-NEXT: vmov q0[2], q0[0], r3, r0 -; CHECK-LV-NEXT: vmov q0[3], q0[1], r7, r2 -; CHECK-LV-NEXT: vstrw.32 q0, [r1] -; CHECK-LV-NEXT: pop.w {r4, r5, r6, r7, r8, pc} -; -; CHECK-LIS-LABEL: vld3_v2i64: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-LIS-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LIS-NEXT: vmov.f32 s12, s2 -; CHECK-LIS-NEXT: vmov.f32 s13, s3 -; CHECK-LIS-NEXT: vmov.f32 s2, s8 -; CHECK-LIS-NEXT: vmov.f32 s3, s9 -; CHECK-LIS-NEXT: vmov r0, r3, d3 -; CHECK-LIS-NEXT: vmov r2, r4, d5 -; CHECK-LIS-NEXT: vmov r6, r7, d0 -; CHECK-LIS-NEXT: vmov r5, r8, d6 -; CHECK-LIS-NEXT: vmov lr, r12, d1 -; CHECK-LIS-NEXT: adds.w r0, r0, lr -; CHECK-LIS-NEXT: adc.w r3, r3, r12 -; CHECK-LIS-NEXT: adds r0, r0, r2 -; CHECK-LIS-NEXT: adc.w r2, r3, r4 -; CHECK-LIS-NEXT: vmov r3, r4, d2 -; CHECK-LIS-NEXT: adds r6, r6, r5 -; CHECK-LIS-NEXT: adc.w r7, r7, r8 -; CHECK-LIS-NEXT: adds r3, r3, r6 -; CHECK-LIS-NEXT: adcs r7, r4 -; CHECK-LIS-NEXT: vmov q0[2], q0[0], r3, r0 -; CHECK-LIS-NEXT: vmov q0[3], q0[1], r7, r2 -; CHECK-LIS-NEXT: vstrw.32 q0, [r1] -; CHECK-LIS-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-LABEL: vld3_v2i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s13, s3 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov r0, r3, d5 +; CHECK-NEXT: vmov r2, r4, d3 +; CHECK-NEXT: vmov r6, r7, d0 +; CHECK-NEXT: vmov r5, r8, d6 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: adds.w r0, r0, lr +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r2, r3, r4 +; CHECK-NEXT: vmov r3, r4, d4 +; CHECK-NEXT: adds r6, r6, r5 +; CHECK-NEXT: adc.w r7, r7, r8 +; CHECK-NEXT: adds r3, r3, r6 +; CHECK-NEXT: adcs r7, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r7, r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <6 x i64>, ptr %src, align 4 %s1 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> @@ -1071,51 +898,50 @@ define void @vld3_v4i64(ptr %src, ptr %dst) { ; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11, d12} ; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] ; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-LIS-NEXT: vldrw.u32 q5, [r0, #48] -; CHECK-LIS-NEXT: vmov.f32 s8, s2 +; CHECK-LIS-NEXT: vmov.f32 s4, s2 ; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-LIS-NEXT: vmov.f32 s9, s3 +; CHECK-LIS-NEXT: vmov.f32 s5, s3 ; CHECK-LIS-NEXT: vmov.f32 s2, s12 ; CHECK-LIS-NEXT: vmov.f32 s3, s13 -; CHECK-LIS-NEXT: vmov r2, r3, d3 -; CHECK-LIS-NEXT: vmov r4, r8, d7 +; CHECK-LIS-NEXT: vmov r5, r4, d5 +; CHECK-LIS-NEXT: vmov r3, r8, d7 ; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80] ; CHECK-LIS-NEXT: vmov.f32 s24, s22 ; CHECK-LIS-NEXT: vmov.f32 s25, s23 -; CHECK-LIS-NEXT: vmov.f32 s7, s19 ; CHECK-LIS-NEXT: vmov lr, r12, d1 ; CHECK-LIS-NEXT: vmov.f32 s2, s12 ; CHECK-LIS-NEXT: vmov.f32 s3, s13 -; CHECK-LIS-NEXT: vmov r6, r7, d12 -; CHECK-LIS-NEXT: adds.w r0, r2, lr -; CHECK-LIS-NEXT: adc.w r2, r3, r12 -; CHECK-LIS-NEXT: adds.w lr, r0, r4 -; CHECK-LIS-NEXT: vmov r3, r5, d10 -; CHECK-LIS-NEXT: adc.w r12, r2, r8 -; CHECK-LIS-NEXT: vmov r2, r0, d8 -; CHECK-LIS-NEXT: adds r3, r3, r6 -; CHECK-LIS-NEXT: adcs r7, r5 -; CHECK-LIS-NEXT: adds r2, r2, r3 -; CHECK-LIS-NEXT: adc.w r8, r7, r0 -; CHECK-LIS-NEXT: vmov r6, r5, d1 -; CHECK-LIS-NEXT: vmov r3, r7, d3 -; CHECK-LIS-NEXT: vmov r4, r0, d0 -; CHECK-LIS-NEXT: adds r3, r3, r6 -; CHECK-LIS-NEXT: adcs r7, r5 -; CHECK-LIS-NEXT: vmov r6, r5, d7 -; CHECK-LIS-NEXT: adds r3, r3, r6 -; CHECK-LIS-NEXT: adcs r7, r5 -; CHECK-LIS-NEXT: vmov r6, r5, d4 -; CHECK-LIS-NEXT: adds r4, r4, r6 -; CHECK-LIS-NEXT: adcs r0, r5 -; CHECK-LIS-NEXT: vmov r5, r6, d2 -; CHECK-LIS-NEXT: vmov q1[2], q1[0], r2, r3 -; CHECK-LIS-NEXT: vmov q1[3], q1[1], r8, r7 +; CHECK-LIS-NEXT: vmov r7, r6, d12 +; CHECK-LIS-NEXT: adds.w r0, r5, lr +; CHECK-LIS-NEXT: adc.w r5, r4, r12 +; CHECK-LIS-NEXT: adds.w lr, r0, r3 +; CHECK-LIS-NEXT: vmov r4, r2, d10 +; CHECK-LIS-NEXT: adc.w r12, r5, r8 +; CHECK-LIS-NEXT: vmov r5, r0, d8 +; CHECK-LIS-NEXT: adds r7, r7, r4 +; CHECK-LIS-NEXT: adcs r2, r6 +; CHECK-LIS-NEXT: adds r7, r7, r5 +; CHECK-LIS-NEXT: adc.w r8, r2, r0 +; CHECK-LIS-NEXT: vmov r6, r4, d1 +; CHECK-LIS-NEXT: vmov r2, r5, d9 +; CHECK-LIS-NEXT: vmov r3, r0, d0 +; CHECK-LIS-NEXT: adds r2, r2, r6 +; CHECK-LIS-NEXT: adc.w r6, r5, r4 +; CHECK-LIS-NEXT: vmov r5, r4, d7 +; CHECK-LIS-NEXT: adds r2, r2, r5 +; CHECK-LIS-NEXT: adcs r6, r4 +; CHECK-LIS-NEXT: vmov r5, r4, d2 +; CHECK-LIS-NEXT: vmov q1[2], q1[0], r7, r2 +; CHECK-LIS-NEXT: vmov q1[3], q1[1], r8, r6 ; CHECK-LIS-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-LIS-NEXT: adds r4, r4, r5 -; CHECK-LIS-NEXT: vmov q0[2], q0[0], r4, lr -; CHECK-LIS-NEXT: adcs r0, r6 +; CHECK-LIS-NEXT: adds r3, r3, r5 +; CHECK-LIS-NEXT: adcs r0, r4 +; CHECK-LIS-NEXT: vmov r4, r5, d4 +; CHECK-LIS-NEXT: adds r3, r3, r4 +; CHECK-LIS-NEXT: vmov q0[2], q0[0], r3, lr +; CHECK-LIS-NEXT: adcs r0, r5 ; CHECK-LIS-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-LIS-NEXT: vstrw.32 q0, [r1] ; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11, d12} @@ -1194,87 +1020,46 @@ entry: } define void @vld3_v8f32(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v8f32: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} -; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-LV-NEXT: vmov.f32 s10, s2 -; CHECK-LV-NEXT: vmov.f32 s13, s0 -; CHECK-LV-NEXT: vmov.f32 s14, s3 -; CHECK-LV-NEXT: vmov.f32 s8, s4 -; CHECK-LV-NEXT: vmov.f32 s9, s7 -; CHECK-LV-NEXT: vmov.f32 s12, s5 -; CHECK-LV-NEXT: vmov.f32 s15, s18 -; CHECK-LV-NEXT: vmov.f32 s11, s17 -; CHECK-LV-NEXT: vadd.f32 q2, q2, q3 -; CHECK-LV-NEXT: vmov.f32 s0, s6 -; CHECK-LV-NEXT: vmov.f32 s2, s16 -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LV-NEXT: vmov.f32 s3, s19 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-LV-NEXT: vadd.f32 q0, q2, q0 -; CHECK-LV-NEXT: vldrw.u32 q2, [r0] -; CHECK-LV-NEXT: vmov.f32 s17, s4 -; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LV-NEXT: vmov.f32 s18, s7 -; CHECK-LV-NEXT: vmov.f32 s22, s6 -; CHECK-LV-NEXT: vmov.f32 s16, s9 -; CHECK-LV-NEXT: vmov.f32 s19, s14 -; CHECK-LV-NEXT: vmov.f32 s20, s8 -; CHECK-LV-NEXT: vmov.f32 s21, s11 -; CHECK-LV-NEXT: vmov.f32 s23, s13 -; CHECK-LV-NEXT: vadd.f32 q4, q5, q4 -; CHECK-LV-NEXT: vmov.f32 s4, s10 -; CHECK-LV-NEXT: vmov.f32 s6, s12 -; CHECK-LV-NEXT: vmov.f32 s7, s15 -; CHECK-LV-NEXT: vadd.f32 q1, q4, q1 -; CHECK-LV-NEXT: vstrw.32 q1, [r1] -; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: vld3_v8f32: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-LIS-NEXT: vmov.f32 s10, s2 -; CHECK-LIS-NEXT: vmov.f32 s13, s0 -; CHECK-LIS-NEXT: vmov.f32 s14, s3 -; CHECK-LIS-NEXT: vmov.f32 s8, s4 -; CHECK-LIS-NEXT: vmov.f32 s9, s7 -; CHECK-LIS-NEXT: vmov.f32 s12, s5 -; CHECK-LIS-NEXT: vmov.f32 s15, s18 -; CHECK-LIS-NEXT: vmov.f32 s11, s17 -; CHECK-LIS-NEXT: vmov.f32 s0, s6 -; CHECK-LIS-NEXT: vadd.f32 q2, q2, q3 -; CHECK-LIS-NEXT: vmov.f32 s2, s16 -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LIS-NEXT: vmov.f32 s3, s19 -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-LIS-NEXT: vadd.f32 q0, q2, q0 -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0] -; CHECK-LIS-NEXT: vmov.f32 s13, s4 -; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LIS-NEXT: vmov.f32 s14, s7 -; CHECK-LIS-NEXT: vmov.f32 s22, s6 -; CHECK-LIS-NEXT: vmov.f32 s12, s9 -; CHECK-LIS-NEXT: vmov.f32 s15, s18 -; CHECK-LIS-NEXT: vmov.f32 s20, s8 -; CHECK-LIS-NEXT: vmov.f32 s21, s11 -; CHECK-LIS-NEXT: vmov.f32 s23, s17 -; CHECK-LIS-NEXT: vadd.f32 q3, q5, q3 -; CHECK-LIS-NEXT: vmov.f32 s4, s10 -; CHECK-LIS-NEXT: vmov.f32 s6, s16 -; CHECK-LIS-NEXT: vmov.f32 s7, s19 -; CHECK-LIS-NEXT: vadd.f32 q1, q3, q1 -; CHECK-LIS-NEXT: vstrw.32 q1, [r1] -; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: vld3_v8f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 +; CHECK-NEXT: vmov.f32 s15, s18 +; CHECK-NEXT: vmov.f32 s11, s17 +; CHECK-NEXT: vadd.f32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vadd.f32 q0, q2, q0 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.f32 s23, s13 +; CHECK-NEXT: vadd.f32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s6, s12 +; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vadd.f32 q1, q4, q1 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr entry: %l1 = load <24 x float>, ptr %src, align 4 %s1 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> @@ -1287,155 +1072,80 @@ entry: } define void @vld3_v16f32(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v16f32: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LV-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-LV-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-LV-NEXT: vmov.f32 s10, s2 -; CHECK-LV-NEXT: vmov.f32 s13, s0 -; CHECK-LV-NEXT: vmov.f32 s14, s3 -; CHECK-LV-NEXT: vmov.f32 s8, s4 -; CHECK-LV-NEXT: vmov.f32 s9, s7 -; CHECK-LV-NEXT: vmov.f32 s12, s5 -; CHECK-LV-NEXT: vmov.f32 s15, s18 -; CHECK-LV-NEXT: vmov.f32 s11, s17 -; CHECK-LV-NEXT: vadd.f32 q2, q2, q3 -; CHECK-LV-NEXT: vmov.f32 s0, s6 -; CHECK-LV-NEXT: vmov.f32 s2, s16 -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LV-NEXT: vmov.f32 s3, s19 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-LV-NEXT: vadd.f32 q0, q2, q0 -; CHECK-LV-NEXT: vldrw.u32 q2, [r0] -; CHECK-LV-NEXT: vmov.f32 s17, s4 -; CHECK-LV-NEXT: vmov.f32 s18, s7 -; CHECK-LV-NEXT: vmov.f32 s22, s6 -; CHECK-LV-NEXT: vmov.f32 s16, s9 -; CHECK-LV-NEXT: vmov.f32 s19, s14 -; CHECK-LV-NEXT: vmov.f32 s20, s8 -; CHECK-LV-NEXT: vmov.f32 s21, s11 -; CHECK-LV-NEXT: vmov.f32 s23, s13 -; CHECK-LV-NEXT: vmov.f32 s4, s10 -; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-LV-NEXT: vmov.f32 s6, s12 -; CHECK-LV-NEXT: vadd.f32 q4, q5, q4 -; CHECK-LV-NEXT: vmov.f32 s7, s15 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-LV-NEXT: vadd.f32 q1, q4, q1 -; CHECK-LV-NEXT: vmov.f32 s18, s10 -; CHECK-LV-NEXT: vmov.f32 s21, s8 -; CHECK-LV-NEXT: vmov.f32 s22, s11 -; CHECK-LV-NEXT: vmov.f32 s16, s12 -; CHECK-LV-NEXT: vmov.f32 s17, s15 -; CHECK-LV-NEXT: vmov.f32 s20, s13 -; CHECK-LV-NEXT: vmov.f32 s23, s26 -; CHECK-LV-NEXT: vmov.f32 s19, s25 -; CHECK-LV-NEXT: vadd.f32 q4, q4, q5 -; CHECK-LV-NEXT: vmov.f32 s8, s14 -; CHECK-LV-NEXT: vmov.f32 s10, s24 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-LV-NEXT: vmov.f32 s11, s27 -; CHECK-LV-NEXT: vldrw.u32 q5, [r0, #128] -; CHECK-LV-NEXT: vadd.f32 q2, q4, q2 -; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-LV-NEXT: vmov.f32 s25, s12 -; CHECK-LV-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-LV-NEXT: vmov.f32 s26, s15 -; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LV-NEXT: vmov.f32 s30, s14 -; CHECK-LV-NEXT: vstrw.32 q1, [r1] -; CHECK-LV-NEXT: vmov.f32 s24, s17 -; CHECK-LV-NEXT: vmov.f32 s27, s22 -; CHECK-LV-NEXT: vmov.f32 s28, s16 -; CHECK-LV-NEXT: vmov.f32 s29, s19 -; CHECK-LV-NEXT: vmov.f32 s31, s21 -; CHECK-LV-NEXT: vadd.f32 q6, q7, q6 -; CHECK-LV-NEXT: vmov.f32 s12, s18 -; CHECK-LV-NEXT: vmov.f32 s14, s20 -; CHECK-LV-NEXT: vmov.f32 s15, s23 -; CHECK-LV-NEXT: vadd.f32 q3, q6, q3 -; CHECK-LV-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-LV-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: vld3_v16f32: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-LIS-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-LIS-NEXT: vmov.f32 s10, s2 -; CHECK-LIS-NEXT: vmov.f32 s13, s0 -; CHECK-LIS-NEXT: vmov.f32 s14, s3 -; CHECK-LIS-NEXT: vmov.f32 s8, s4 -; CHECK-LIS-NEXT: vmov.f32 s9, s7 -; CHECK-LIS-NEXT: vmov.f32 s12, s5 -; CHECK-LIS-NEXT: vmov.f32 s15, s18 -; CHECK-LIS-NEXT: vmov.f32 s11, s17 -; CHECK-LIS-NEXT: vmov.f32 s0, s6 -; CHECK-LIS-NEXT: vadd.f32 q2, q2, q3 -; CHECK-LIS-NEXT: vmov.f32 s2, s16 -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LIS-NEXT: vmov.f32 s3, s19 -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-LIS-NEXT: vadd.f32 q0, q2, q0 -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0] -; CHECK-LIS-NEXT: vmov.f32 s13, s4 -; CHECK-LIS-NEXT: vmov.f32 s14, s7 -; CHECK-LIS-NEXT: vmov.f32 s22, s6 -; CHECK-LIS-NEXT: vmov.f32 s12, s9 -; CHECK-LIS-NEXT: vmov.f32 s15, s18 -; CHECK-LIS-NEXT: vmov.f32 s20, s8 -; CHECK-LIS-NEXT: vmov.f32 s21, s11 -; CHECK-LIS-NEXT: vmov.f32 s23, s17 -; CHECK-LIS-NEXT: vadd.f32 q3, q5, q3 -; CHECK-LIS-NEXT: vmov.f32 s4, s10 -; CHECK-LIS-NEXT: vmov.f32 s7, s19 -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-LIS-NEXT: vmov.f32 s6, s16 -; CHECK-LIS-NEXT: vadd.f32 q1, q3, q1 -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-LIS-NEXT: vmov.f32 s18, s10 -; CHECK-LIS-NEXT: vmov.f32 s21, s8 -; CHECK-LIS-NEXT: vmov.f32 s22, s11 -; CHECK-LIS-NEXT: vmov.f32 s16, s12 -; CHECK-LIS-NEXT: vmov.f32 s17, s15 -; CHECK-LIS-NEXT: vmov.f32 s20, s13 -; CHECK-LIS-NEXT: vmov.f32 s23, s26 -; CHECK-LIS-NEXT: vmov.f32 s19, s25 -; CHECK-LIS-NEXT: vmov.f32 s8, s14 -; CHECK-LIS-NEXT: vadd.f32 q4, q4, q5 -; CHECK-LIS-NEXT: vmov.f32 s10, s24 -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-LIS-NEXT: vmov.f32 s11, s27 -; CHECK-LIS-NEXT: vldrw.u32 q6, [r0, #128] -; CHECK-LIS-NEXT: vadd.f32 q2, q4, q2 -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-LIS-NEXT: vmov.f32 s21, s12 -; CHECK-LIS-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-LIS-NEXT: vmov.f32 s22, s15 -; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LIS-NEXT: vmov.f32 s30, s14 -; CHECK-LIS-NEXT: vstrw.32 q1, [r1] -; CHECK-LIS-NEXT: vmov.f32 s20, s17 -; CHECK-LIS-NEXT: vmov.f32 s23, s26 -; CHECK-LIS-NEXT: vmov.f32 s28, s16 -; CHECK-LIS-NEXT: vmov.f32 s29, s19 -; CHECK-LIS-NEXT: vmov.f32 s31, s25 -; CHECK-LIS-NEXT: vadd.f32 q5, q7, q5 -; CHECK-LIS-NEXT: vmov.f32 s12, s18 -; CHECK-LIS-NEXT: vmov.f32 s14, s24 -; CHECK-LIS-NEXT: vmov.f32 s15, s27 -; CHECK-LIS-NEXT: vadd.f32 q3, q5, q3 -; CHECK-LIS-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: vld3_v16f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-NEXT: vldrw.u32 q6, [r0, #176] +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 +; CHECK-NEXT: vmov.f32 s15, s18 +; CHECK-NEXT: vmov.f32 s11, s17 +; CHECK-NEXT: vadd.f32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vadd.f32 q0, q2, q0 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.f32 s23, s13 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-NEXT: vmov.f32 s6, s12 +; CHECK-NEXT: vadd.f32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vldrw.u32 q3, [r0, #144] +; CHECK-NEXT: vadd.f32 q1, q4, q1 +; CHECK-NEXT: vmov.f32 s18, s10 +; CHECK-NEXT: vmov.f32 s21, s8 +; CHECK-NEXT: vmov.f32 s22, s11 +; CHECK-NEXT: vmov.f32 s16, s12 +; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vmov.f32 s20, s13 +; CHECK-NEXT: vmov.f32 s23, s26 +; CHECK-NEXT: vmov.f32 s19, s25 +; CHECK-NEXT: vadd.f32 q4, q4, q5 +; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: vmov.f32 s10, s24 +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vmov.f32 s11, s27 +; CHECK-NEXT: vldrw.u32 q5, [r0, #128] +; CHECK-NEXT: vadd.f32 q2, q4, q2 +; CHECK-NEXT: vldrw.u32 q4, [r0, #96] +; CHECK-NEXT: vmov.f32 s25, s12 +; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.f32 s30, s14 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmov.f32 s24, s17 +; CHECK-NEXT: vmov.f32 s27, s22 +; CHECK-NEXT: vmov.f32 s28, s16 +; CHECK-NEXT: vmov.f32 s29, s19 +; CHECK-NEXT: vmov.f32 s31, s21 +; CHECK-NEXT: vadd.f32 q6, q7, q6 +; CHECK-NEXT: vmov.f32 s12, s18 +; CHECK-NEXT: vmov.f32 s14, s20 +; CHECK-NEXT: vmov.f32 s15, s23 +; CHECK-NEXT: vadd.f32 q3, q6, q3 +; CHECK-NEXT: vstrw.32 q3, [r1, #32] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr entry: %l1 = load <48 x float>, ptr %src, align 4 %s1 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> @@ -1518,49 +1228,93 @@ entry: } define void @vld3_v8f16(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v8f16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.f32 s5, s8 -; CHECK-NEXT: vmovx.f16 s8, s8 -; CHECK-NEXT: vmovx.f16 s17, s3 -; CHECK-NEXT: vins.f16 s3, s8 -; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vmovx.f16 s18, s10 -; CHECK-NEXT: vmovx.f16 s16, s0 -; CHECK-NEXT: vins.f16 s10, s8 -; CHECK-NEXT: vmovx.f16 s6, s2 -; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vmovx.f16 s8, s14 -; CHECK-NEXT: vmovx.f16 s19, s13 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s9 -; CHECK-NEXT: vins.f16 s16, s2 -; CHECK-NEXT: vmovx.f16 s2, s15 -; CHECK-NEXT: vmovx.f16 s7, s12 -; CHECK-NEXT: vins.f16 s18, s12 -; CHECK-NEXT: vmovx.f16 s12, s1 -; CHECK-NEXT: vins.f16 s13, s8 -; CHECK-NEXT: vins.f16 s5, s6 -; CHECK-NEXT: vmov.f32 s6, s11 -; CHECK-NEXT: vins.f16 s14, s2 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vins.f16 s19, s15 -; CHECK-NEXT: vins.f16 s17, s9 -; CHECK-NEXT: vins.f16 s0, s12 -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vmov.f32 s3, s13 -; CHECK-NEXT: vins.f16 s6, s7 -; CHECK-NEXT: vmov.f32 s7, s14 -; CHECK-NEXT: vadd.f16 q0, q0, q4 -; CHECK-NEXT: vadd.f16 q0, q0, q1 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: vld3_v8f16: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9} +; CHECK-LV-NEXT: vpush {d8, d9} +; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-LV-NEXT: vldrw.u32 q0, [r0] +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-LV-NEXT: vmov.f32 s5, s8 +; CHECK-LV-NEXT: vmovx.f16 s8, s8 +; CHECK-LV-NEXT: vmovx.f16 s17, s3 +; CHECK-LV-NEXT: vins.f16 s3, s8 +; CHECK-LV-NEXT: vmovx.f16 s8, s11 +; CHECK-LV-NEXT: vmovx.f16 s18, s10 +; CHECK-LV-NEXT: vmovx.f16 s16, s0 +; CHECK-LV-NEXT: vins.f16 s10, s8 +; CHECK-LV-NEXT: vmovx.f16 s6, s2 +; CHECK-LV-NEXT: vmov.f32 s4, s1 +; CHECK-LV-NEXT: vmovx.f16 s8, s14 +; CHECK-LV-NEXT: vmovx.f16 s19, s13 +; CHECK-LV-NEXT: vins.f16 s4, s6 +; CHECK-LV-NEXT: vmovx.f16 s6, s9 +; CHECK-LV-NEXT: vins.f16 s16, s2 +; CHECK-LV-NEXT: vmovx.f16 s2, s15 +; CHECK-LV-NEXT: vmovx.f16 s7, s12 +; CHECK-LV-NEXT: vins.f16 s18, s12 +; CHECK-LV-NEXT: vmovx.f16 s12, s1 +; CHECK-LV-NEXT: vins.f16 s13, s8 +; CHECK-LV-NEXT: vins.f16 s5, s6 +; CHECK-LV-NEXT: vmov.f32 s6, s11 +; CHECK-LV-NEXT: vins.f16 s14, s2 +; CHECK-LV-NEXT: vmov.f32 s1, s3 +; CHECK-LV-NEXT: vins.f16 s19, s15 +; CHECK-LV-NEXT: vins.f16 s17, s9 +; CHECK-LV-NEXT: vins.f16 s0, s12 +; CHECK-LV-NEXT: vmov.f32 s2, s10 +; CHECK-LV-NEXT: vmov.f32 s3, s13 +; CHECK-LV-NEXT: vins.f16 s6, s7 +; CHECK-LV-NEXT: vmov.f32 s7, s14 +; CHECK-LV-NEXT: vadd.f16 q0, q0, q4 +; CHECK-LV-NEXT: vadd.f16 q0, q0, q1 +; CHECK-LV-NEXT: vstrw.32 q0, [r1] +; CHECK-LV-NEXT: vpop {d8, d9} +; CHECK-LV-NEXT: bx lr +; +; CHECK-LIS-LABEL: vld3_v8f16: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9} +; CHECK-LIS-NEXT: vpush {d8, d9} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-LIS-NEXT: vmov.f32 s4, s1 +; CHECK-LIS-NEXT: vmovx.f16 s6, s2 +; CHECK-LIS-NEXT: vins.f16 s4, s6 +; CHECK-LIS-NEXT: vmov.f32 s5, s8 +; CHECK-LIS-NEXT: vmovx.f16 s6, s9 +; CHECK-LIS-NEXT: vmovx.f16 s8, s8 +; CHECK-LIS-NEXT: vmovx.f16 s13, s3 +; CHECK-LIS-NEXT: vins.f16 s5, s6 +; CHECK-LIS-NEXT: vins.f16 s3, s8 +; CHECK-LIS-NEXT: vmov.f32 s6, s11 +; CHECK-LIS-NEXT: vmovx.f16 s12, s16 +; CHECK-LIS-NEXT: vmovx.f16 s8, s11 +; CHECK-LIS-NEXT: vmovx.f16 s14, s10 +; CHECK-LIS-NEXT: vins.f16 s6, s12 +; CHECK-LIS-NEXT: vmovx.f16 s12, s0 +; CHECK-LIS-NEXT: vins.f16 s10, s8 +; CHECK-LIS-NEXT: vmovx.f16 s8, s18 +; CHECK-LIS-NEXT: vmovx.f16 s15, s17 +; CHECK-LIS-NEXT: vins.f16 s12, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s19 +; CHECK-LIS-NEXT: vmovx.f16 s1, s1 +; CHECK-LIS-NEXT: vins.f16 s17, s8 +; CHECK-LIS-NEXT: vins.f16 s18, s2 +; CHECK-LIS-NEXT: vins.f16 s0, s1 +; CHECK-LIS-NEXT: vmov.f32 s1, s3 +; CHECK-LIS-NEXT: vins.f16 s14, s16 +; CHECK-LIS-NEXT: vins.f16 s15, s19 +; CHECK-LIS-NEXT: vins.f16 s13, s9 +; CHECK-LIS-NEXT: vmov.f32 s2, s10 +; CHECK-LIS-NEXT: vmov.f32 s3, s17 +; CHECK-LIS-NEXT: vmov.f32 s7, s18 +; CHECK-LIS-NEXT: vadd.f16 q0, q0, q3 +; CHECK-LIS-NEXT: vadd.f16 q0, q0, q1 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1] +; CHECK-LIS-NEXT: vpop {d8, d9} +; CHECK-LIS-NEXT: bx lr entry: %l1 = load <24 x half>, ptr %src, align 4 %s1 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> @@ -1573,86 +1327,167 @@ entry: } define void @vld3_v16f16(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v16f16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vmovx.f16 s6, s2 -; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s9 -; CHECK-NEXT: vmov.f32 s5, s8 -; CHECK-NEXT: vmovx.f16 s7, s12 -; CHECK-NEXT: vins.f16 s5, s6 -; CHECK-NEXT: vmov.f32 s6, s11 -; CHECK-NEXT: vins.f16 s6, s7 -; CHECK-NEXT: vmovx.f16 s16, s15 -; CHECK-NEXT: vmov.f32 s7, s14 -; CHECK-NEXT: vmovx.f16 s17, s3 -; CHECK-NEXT: vins.f16 s7, s16 -; CHECK-NEXT: vmovx.f16 s16, s0 -; CHECK-NEXT: vins.f16 s16, s2 -; CHECK-NEXT: vmovx.f16 s2, s1 -; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmovx.f16 s2, s8 -; CHECK-NEXT: vins.f16 s3, s2 -; CHECK-NEXT: vmovx.f16 s2, s11 -; CHECK-NEXT: vmovx.f16 s18, s10 -; CHECK-NEXT: vins.f16 s10, s2 -; CHECK-NEXT: vmovx.f16 s2, s14 -; CHECK-NEXT: vmovx.f16 s19, s13 -; CHECK-NEXT: vins.f16 s13, s2 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vins.f16 s18, s12 -; CHECK-NEXT: vins.f16 s19, s15 -; CHECK-NEXT: vmov.f32 s3, s13 -; CHECK-NEXT: vins.f16 s17, s9 -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vadd.f16 q0, q0, q4 -; CHECK-NEXT: vadd.f16 q2, q0, q1 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vstrw.32 q2, [r1, #16] -; CHECK-NEXT: vmovx.f16 s10, s2 -; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vmovx.f16 s10, s13 -; CHECK-NEXT: vmov.f32 s9, s12 -; CHECK-NEXT: vmovx.f16 s11, s4 -; CHECK-NEXT: vins.f16 s9, s10 -; CHECK-NEXT: vmov.f32 s10, s15 -; CHECK-NEXT: vins.f16 s10, s11 -; CHECK-NEXT: vmovx.f16 s16, s7 -; CHECK-NEXT: vmov.f32 s11, s6 -; CHECK-NEXT: vmovx.f16 s17, s3 -; CHECK-NEXT: vins.f16 s11, s16 -; CHECK-NEXT: vmovx.f16 s16, s0 -; CHECK-NEXT: vins.f16 s16, s2 -; CHECK-NEXT: vmovx.f16 s2, s1 -; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmovx.f16 s2, s12 -; CHECK-NEXT: vins.f16 s3, s2 -; CHECK-NEXT: vmovx.f16 s2, s15 -; CHECK-NEXT: vmovx.f16 s18, s14 -; CHECK-NEXT: vins.f16 s14, s2 -; CHECK-NEXT: vmovx.f16 s2, s6 -; CHECK-NEXT: vmovx.f16 s19, s5 -; CHECK-NEXT: vins.f16 s5, s2 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vins.f16 s18, s4 -; CHECK-NEXT: vins.f16 s19, s7 -; CHECK-NEXT: vins.f16 s17, s13 -; CHECK-NEXT: vmov.f32 s2, s14 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vadd.f16 q0, q0, q4 -; CHECK-NEXT: vadd.f16 q0, q0, q2 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: vld3_v16f16: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9} +; CHECK-LV-NEXT: vpush {d8, d9} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-LV-NEXT: vmovx.f16 s6, s2 +; CHECK-LV-NEXT: vmov.f32 s4, s1 +; CHECK-LV-NEXT: vins.f16 s4, s6 +; CHECK-LV-NEXT: vmovx.f16 s6, s9 +; CHECK-LV-NEXT: vmov.f32 s5, s8 +; CHECK-LV-NEXT: vmovx.f16 s7, s12 +; CHECK-LV-NEXT: vins.f16 s5, s6 +; CHECK-LV-NEXT: vmov.f32 s6, s11 +; CHECK-LV-NEXT: vins.f16 s6, s7 +; CHECK-LV-NEXT: vmovx.f16 s16, s15 +; CHECK-LV-NEXT: vmov.f32 s7, s14 +; CHECK-LV-NEXT: vmovx.f16 s17, s3 +; CHECK-LV-NEXT: vins.f16 s7, s16 +; CHECK-LV-NEXT: vmovx.f16 s16, s0 +; CHECK-LV-NEXT: vins.f16 s16, s2 +; CHECK-LV-NEXT: vmovx.f16 s2, s1 +; CHECK-LV-NEXT: vins.f16 s0, s2 +; CHECK-LV-NEXT: vmovx.f16 s2, s8 +; CHECK-LV-NEXT: vins.f16 s3, s2 +; CHECK-LV-NEXT: vmovx.f16 s2, s11 +; CHECK-LV-NEXT: vmovx.f16 s18, s10 +; CHECK-LV-NEXT: vins.f16 s10, s2 +; CHECK-LV-NEXT: vmovx.f16 s2, s14 +; CHECK-LV-NEXT: vmovx.f16 s19, s13 +; CHECK-LV-NEXT: vins.f16 s13, s2 +; CHECK-LV-NEXT: vmov.f32 s1, s3 +; CHECK-LV-NEXT: vins.f16 s18, s12 +; CHECK-LV-NEXT: vins.f16 s19, s15 +; CHECK-LV-NEXT: vmov.f32 s3, s13 +; CHECK-LV-NEXT: vins.f16 s17, s9 +; CHECK-LV-NEXT: vmov.f32 s2, s10 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-LV-NEXT: vadd.f16 q0, q0, q4 +; CHECK-LV-NEXT: vadd.f16 q2, q0, q1 +; CHECK-LV-NEXT: vldrw.u32 q0, [r0] +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-LV-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-LV-NEXT: vmovx.f16 s10, s2 +; CHECK-LV-NEXT: vmov.f32 s8, s1 +; CHECK-LV-NEXT: vins.f16 s8, s10 +; CHECK-LV-NEXT: vmovx.f16 s10, s13 +; CHECK-LV-NEXT: vmov.f32 s9, s12 +; CHECK-LV-NEXT: vmovx.f16 s11, s4 +; CHECK-LV-NEXT: vins.f16 s9, s10 +; CHECK-LV-NEXT: vmov.f32 s10, s15 +; CHECK-LV-NEXT: vins.f16 s10, s11 +; CHECK-LV-NEXT: vmovx.f16 s16, s7 +; CHECK-LV-NEXT: vmov.f32 s11, s6 +; CHECK-LV-NEXT: vmovx.f16 s17, s3 +; CHECK-LV-NEXT: vins.f16 s11, s16 +; CHECK-LV-NEXT: vmovx.f16 s16, s0 +; CHECK-LV-NEXT: vins.f16 s16, s2 +; CHECK-LV-NEXT: vmovx.f16 s2, s1 +; CHECK-LV-NEXT: vins.f16 s0, s2 +; CHECK-LV-NEXT: vmovx.f16 s2, s12 +; CHECK-LV-NEXT: vins.f16 s3, s2 +; CHECK-LV-NEXT: vmovx.f16 s2, s15 +; CHECK-LV-NEXT: vmovx.f16 s18, s14 +; CHECK-LV-NEXT: vins.f16 s14, s2 +; CHECK-LV-NEXT: vmovx.f16 s2, s6 +; CHECK-LV-NEXT: vmovx.f16 s19, s5 +; CHECK-LV-NEXT: vins.f16 s5, s2 +; CHECK-LV-NEXT: vmov.f32 s1, s3 +; CHECK-LV-NEXT: vins.f16 s18, s4 +; CHECK-LV-NEXT: vins.f16 s19, s7 +; CHECK-LV-NEXT: vins.f16 s17, s13 +; CHECK-LV-NEXT: vmov.f32 s2, s14 +; CHECK-LV-NEXT: vmov.f32 s3, s5 +; CHECK-LV-NEXT: vadd.f16 q0, q0, q4 +; CHECK-LV-NEXT: vadd.f16 q0, q0, q2 +; CHECK-LV-NEXT: vstrw.32 q0, [r1] +; CHECK-LV-NEXT: vpop {d8, d9} +; CHECK-LV-NEXT: bx lr +; +; CHECK-LIS-LABEL: vld3_v16f16: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9} +; CHECK-LIS-NEXT: vpush {d8, d9} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-LIS-NEXT: vmovx.f16 s6, s2 +; CHECK-LIS-NEXT: vmov.f32 s4, s1 +; CHECK-LIS-NEXT: vins.f16 s4, s6 +; CHECK-LIS-NEXT: vmovx.f16 s6, s9 +; CHECK-LIS-NEXT: vmov.f32 s5, s8 +; CHECK-LIS-NEXT: vmovx.f16 s7, s12 +; CHECK-LIS-NEXT: vins.f16 s5, s6 +; CHECK-LIS-NEXT: vmov.f32 s6, s11 +; CHECK-LIS-NEXT: vins.f16 s6, s7 +; CHECK-LIS-NEXT: vmovx.f16 s16, s15 +; CHECK-LIS-NEXT: vmov.f32 s7, s14 +; CHECK-LIS-NEXT: vmovx.f16 s17, s3 +; CHECK-LIS-NEXT: vins.f16 s7, s16 +; CHECK-LIS-NEXT: vmovx.f16 s16, s0 +; CHECK-LIS-NEXT: vins.f16 s16, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s1 +; CHECK-LIS-NEXT: vins.f16 s0, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s8 +; CHECK-LIS-NEXT: vins.f16 s3, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s11 +; CHECK-LIS-NEXT: vmovx.f16 s18, s10 +; CHECK-LIS-NEXT: vins.f16 s10, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s14 +; CHECK-LIS-NEXT: vmovx.f16 s19, s13 +; CHECK-LIS-NEXT: vins.f16 s13, s2 +; CHECK-LIS-NEXT: vmov.f32 s1, s3 +; CHECK-LIS-NEXT: vins.f16 s18, s12 +; CHECK-LIS-NEXT: vins.f16 s19, s15 +; CHECK-LIS-NEXT: vmov.f32 s3, s13 +; CHECK-LIS-NEXT: vins.f16 s17, s9 +; CHECK-LIS-NEXT: vmov.f32 s2, s10 +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-LIS-NEXT: vadd.f16 q0, q0, q4 +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-LIS-NEXT: vadd.f16 q1, q0, q1 +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] +; CHECK-LIS-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-LIS-NEXT: vmov.f32 s5, s12 +; CHECK-LIS-NEXT: vmovx.f16 s6, s2 +; CHECK-LIS-NEXT: vmov.f32 s4, s1 +; CHECK-LIS-NEXT: vins.f16 s4, s6 +; CHECK-LIS-NEXT: vmovx.f16 s6, s13 +; CHECK-LIS-NEXT: vins.f16 s5, s6 +; CHECK-LIS-NEXT: vmov.f32 s6, s15 +; CHECK-LIS-NEXT: vmovx.f16 s7, s8 +; CHECK-LIS-NEXT: vmovx.f16 s16, s11 +; CHECK-LIS-NEXT: vins.f16 s6, s7 +; CHECK-LIS-NEXT: vmov.f32 s7, s10 +; CHECK-LIS-NEXT: vins.f16 s7, s16 +; CHECK-LIS-NEXT: vmovx.f16 s16, s0 +; CHECK-LIS-NEXT: vins.f16 s16, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s1 +; CHECK-LIS-NEXT: vins.f16 s0, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s12 +; CHECK-LIS-NEXT: vmovx.f16 s17, s3 +; CHECK-LIS-NEXT: vins.f16 s3, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s15 +; CHECK-LIS-NEXT: vmovx.f16 s18, s14 +; CHECK-LIS-NEXT: vins.f16 s14, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s10 +; CHECK-LIS-NEXT: vmovx.f16 s19, s9 +; CHECK-LIS-NEXT: vins.f16 s9, s2 +; CHECK-LIS-NEXT: vmov.f32 s1, s3 +; CHECK-LIS-NEXT: vins.f16 s18, s8 +; CHECK-LIS-NEXT: vins.f16 s19, s11 +; CHECK-LIS-NEXT: vins.f16 s17, s13 +; CHECK-LIS-NEXT: vmov.f32 s2, s14 +; CHECK-LIS-NEXT: vmov.f32 s3, s9 +; CHECK-LIS-NEXT: vadd.f16 q0, q0, q4 +; CHECK-LIS-NEXT: vadd.f16 q0, q0, q1 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1] +; CHECK-LIS-NEXT: vpop {d8, d9} +; CHECK-LIS-NEXT: bx lr entry: %l1 = load <48 x half>, ptr %src, align 4 %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll index 6d6f9aca7188d..0c349c3aa8ec1 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll @@ -77,29 +77,32 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64 = SHL64ri [[COPY8]], 2, implicit-def dead $eflags ; CHECK-NEXT: MOV64mr %stack.10, 1, $noreg, 0, $noreg, [[COPY8]] :: (store (s64) into %stack.10) ; CHECK-NEXT: [[LEA64r1:%[0-9]+]]:gr64 = LEA64r $noreg, 4, [[MOVSX64rr32_3]], 0, $noreg + ; CHECK-NEXT: MOV64mr %stack.11, 1, $noreg, 0, $noreg, [[LEA64r1]] :: (store (s64) into %stack.11) ; CHECK-NEXT: MOV64mr %stack.4, 1, $noreg, 0, $noreg, [[MOVSX64rm32_]] :: (store (s64) into %stack.4) ; CHECK-NEXT: [[LEA64_32r3:%[0-9]+]]:gr32 = LEA64_32r [[COPY5]], 4, [[MOVSX64rm32_]], 0, $noreg - ; CHECK-NEXT: MOV32mr %stack.11, 1, $noreg, 0, $noreg, [[LEA64_32r3]] :: (store (s32) into %stack.11) + ; CHECK-NEXT: MOV32mr %stack.12, 1, $noreg, 0, $noreg, [[LEA64_32r3]] :: (store (s32) into %stack.12) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.for.cond14.preheader: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8) ; CHECK-NEXT: CMP32rm [[MOV32rm4]], %fixed-stack.1, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s32) from %fixed-stack.1, align 16) + ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3) + ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5) ; CHECK-NEXT: JCC_1 %bb.5, 13, implicit $eflags ; CHECK-NEXT: JMP_1 %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3.for.body17.lr.ph: ; CHECK-NEXT: successors: %bb.6(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) - ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = nsw IMUL64rr [[MOV64rm]], [[MOVSX64rr32_]], implicit-def dead $eflags - ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = ADD64rm [[MOV64rm]], %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.1) - ; CHECK-NEXT: MOV64mr %stack.12, 1, $noreg, 0, $noreg, [[MOV64rm]] :: (store (s64) into %stack.12) - ; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.11, 1, $noreg, 0, $noreg :: (load (s32) from %stack.11) + ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) + ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = nsw IMUL64rr [[MOV64rm2]], [[MOVSX64rr32_]], implicit-def dead $eflags + ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = ADD64rm [[MOV64rm2]], %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.1) + ; CHECK-NEXT: MOV64mr %stack.13, 1, $noreg, 0, $noreg, [[MOV64rm2]] :: (store (s64) into %stack.13) + ; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.12, 1, $noreg, 0, $noreg :: (load (s32) from %stack.12) ; CHECK-NEXT: undef [[COPY9:%[0-9]+]].sub_32bit:gr64_nosp = COPY [[MOV32rm5]] - ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9) - ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4) + ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9) + ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4) ; CHECK-NEXT: JMP_1 %bb.6 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4.for.cond.cleanup: @@ -108,12 +111,12 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: bb.5.for.cond.cleanup16: ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.4(0x04000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) - ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = ADD64rm [[MOV64rm3]], %stack.7, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.7) - ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = MOV64rm %stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %stack.10) - ; CHECK-NEXT: ADD64mr %stack.9, 1, $noreg, 0, $noreg, [[MOV64rm4]], implicit-def dead $eflags :: (store (s64) into %stack.9) - ; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[MOV64rm3]] :: (store (s64) into %stack.6) - ; CHECK-NEXT: CMP64rm [[MOV64rm3]], %stack.8, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s64) from %stack.8) + ; CHECK-NEXT: [[MOV64rm5:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) + ; CHECK-NEXT: [[MOV64rm5:%[0-9]+]]:gr64 = ADD64rm [[MOV64rm5]], %stack.7, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.7) + ; CHECK-NEXT: [[MOV64rm6:%[0-9]+]]:gr64 = MOV64rm %stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %stack.10) + ; CHECK-NEXT: ADD64mr %stack.9, 1, $noreg, 0, $noreg, [[MOV64rm6]], implicit-def dead $eflags :: (store (s64) into %stack.9) + ; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[MOV64rm5]] :: (store (s64) into %stack.6) + ; CHECK-NEXT: CMP64rm [[MOV64rm5]], %stack.8, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s64) from %stack.8) ; CHECK-NEXT: JCC_1 %bb.2, 12, implicit $eflags ; CHECK-NEXT: JMP_1 %bb.4 ; CHECK-NEXT: {{ $}} @@ -121,11 +124,11 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: successors: %bb.6(0x7c000000), %bb.5(0x04000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit - ; CHECK-NEXT: [[MOV64rm5:%[0-9]+]]:gr64 = MOV64rm %stack.12, 1, $noreg, 0, $noreg :: (load (s64) from %stack.12) - ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY6]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm5]], 1, [[MOVSX64rr32_]], 0, $noreg + ; CHECK-NEXT: [[MOV64rm7:%[0-9]+]]:gr64 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load (s64) from %stack.13) + ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY6]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm7]], 1, [[MOVSX64rr32_]], 0, $noreg ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[COPY9]].sub_32bit ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr32 = COPY [[LEA64_32r1]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr64 = COPY [[LEA64r1]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr64 = COPY [[MOV64rm1]] ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr32 = COPY [[COPY4]] ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr32 = COPY [[COPY6]] ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]] @@ -134,9 +137,10 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: [[COPY17:%[0-9]+]]:gr64 = COPY [[MOV32rm2]] ; CHECK-NEXT: [[COPY18:%[0-9]+]]:gr64 = COPY [[COPY1]] ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[COPY18]], 1, [[COPY9]], 0, $noreg + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:gr64_nosp = COPY [[MOV64rm]] ; CHECK-NEXT: [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2) - ; CHECK-NEXT: [[MOV64rm6:%[0-9]+]]:gr64_nosp = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3) - ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm6]].sub_16bit, [[COPY10]].sub_16bit, [[LEA64r2]], 1, [[MOV64rm6]], 0, $noreg + ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm6]].sub_16bit, [[COPY10]].sub_16bit, [[LEA64r2]], 1, [[COPY19]], 0, $noreg + ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = COPY [[COPY19]] ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY [[COPY18]] ; CHECK-NEXT: [[MOV32rm2:%[0-9]+]]:gr64_nosp = COPY [[COPY17]] ; CHECK-NEXT: [[MOVSX64rr32_:%[0-9]+]]:gr64_nosp = COPY [[COPY16]] @@ -144,15 +148,15 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = COPY [[COPY14]] ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr32 = COPY [[COPY13]] ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr32 = COPY [[COPY12]] - ; CHECK-NEXT: [[LEA64r1:%[0-9]+]]:gr64 = COPY [[COPY11]] - ; CHECK-NEXT: [[MOV64rm7:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5) + ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = COPY [[COPY11]] + ; CHECK-NEXT: [[MOV64rm8:%[0-9]+]]:gr64 = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11) ; CHECK-NEXT: [[LEA64_32r1:%[0-9]+]]:gr32 = COPY [[COPY10]] ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY6]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[COPY4]].sub_16bit, [[PTILEZEROV]], [[PTILELOADDV]], [[PTILELOADDV1]] - ; CHECK-NEXT: PTILESTOREDV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit, [[MOV64rm1]], 1, [[MOVSX64rr32_2]], 0, $noreg, [[PTILEZEROV]] - ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm2]], [[MOVSX64rr32_3]], implicit-def dead $eflags - ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm1]], [[LEA64r1]], implicit-def dead $eflags + ; CHECK-NEXT: PTILESTOREDV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_2]], 0, $noreg, [[PTILEZEROV]] + ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm4]], [[MOVSX64rr32_3]], implicit-def dead $eflags + ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm3]], [[MOV64rm8]], implicit-def dead $eflags ; CHECK-NEXT: [[COPY9:%[0-9]+]].sub_32bit:gr64_nosp = ADD32rr [[COPY9]].sub_32bit, [[LEA64_32r1]], implicit-def dead $eflags - ; CHECK-NEXT: CMP64rr [[MOV64rm2]], [[MOV64rm7]], implicit-def $eflags + ; CHECK-NEXT: CMP64rr [[MOV64rm4]], [[MOV64rm1]], implicit-def $eflags ; CHECK-NEXT: JCC_1 %bb.6, 12, implicit $eflags ; CHECK-NEXT: JMP_1 %bb.5 entry: diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll index 8699aaa505142..5969aae43f82e 100644 --- a/llvm/test/CodeGen/X86/abs.ll +++ b/llvm/test/CodeGen/X86/abs.ll @@ -339,9 +339,9 @@ define <8 x i32> @test_v8i32(<8 x i32> %a) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, %ecx @@ -352,18 +352,18 @@ define <8 x i32> @test_v8i32(<8 x i32> %a) nounwind { ; X86-NEXT: negl %ecx ; X86-NEXT: cmovsl %esi, %ecx ; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: movl %edi, %esi ; X86-NEXT: negl %esi -; X86-NEXT: cmovsl %ebx, %esi -; X86-NEXT: movl %ebp, %ebx -; X86-NEXT: negl %ebx -; X86-NEXT: cmovsl %ebp, %ebx -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: negl %ebp -; X86-NEXT: cmovsl %edi, %ebp -; X86-NEXT: movl %eax, %edi +; X86-NEXT: cmovsl %edi, %esi +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: negl %edi -; X86-NEXT: cmovsl %eax, %edi +; X86-NEXT: cmovsl %ebp, %edi +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: negl %ebp +; X86-NEXT: cmovsl %ebx, %ebp +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: negl %ebx +; X86-NEXT: cmovsl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: negl %eax @@ -375,9 +375,9 @@ define <8 x i32> @test_v8i32(<8 x i32> %a) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %ecx, 28(%edx) ; X86-NEXT: movl %eax, 24(%edx) -; X86-NEXT: movl %edi, 20(%edx) +; X86-NEXT: movl %ebx, 20(%edx) ; X86-NEXT: movl %ebp, 16(%edx) -; X86-NEXT: movl %ebx, 12(%edx) +; X86-NEXT: movl %edi, 12(%edx) ; X86-NEXT: movl %esi, 8(%edx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NEXT: movl %eax, 4(%edx) @@ -415,9 +415,9 @@ define <8 x i16> @test_v8i16(<8 x i16> %a) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, %ecx @@ -428,18 +428,18 @@ define <8 x i16> @test_v8i16(<8 x i16> %a) nounwind { ; X86-NEXT: negw %cx ; X86-NEXT: cmovsw %si, %cx ; X86-NEXT: movw %cx, (%esp) # 2-byte Spill -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: movl %edi, %esi ; X86-NEXT: negw %si -; X86-NEXT: cmovsw %bx, %si -; X86-NEXT: movl %ebp, %ebx -; X86-NEXT: negw %bx -; X86-NEXT: cmovsw %bp, %bx -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: negw %bp -; X86-NEXT: cmovsw %di, %bp -; X86-NEXT: movl %eax, %edi +; X86-NEXT: cmovsw %di, %si +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: negw %di -; X86-NEXT: cmovsw %ax, %di +; X86-NEXT: cmovsw %bp, %di +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: negw %bp +; X86-NEXT: cmovsw %bx, %bp +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: negw %bx +; X86-NEXT: cmovsw %ax, %bx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: negw %ax @@ -451,9 +451,9 @@ define <8 x i16> @test_v8i16(<8 x i16> %a) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movw %cx, 14(%edx) ; X86-NEXT: movw %ax, 12(%edx) -; X86-NEXT: movw %di, 10(%edx) +; X86-NEXT: movw %bx, 10(%edx) ; X86-NEXT: movw %bp, 8(%edx) -; X86-NEXT: movw %bx, 6(%edx) +; X86-NEXT: movw %di, 6(%edx) ; X86-NEXT: movw %si, 4(%edx) ; X86-NEXT: movzwl (%esp), %eax # 2-byte Folded Reload ; X86-NEXT: movw %ax, 2(%edx) @@ -486,7 +486,6 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind { ; X86-LABEL: test_v16i8: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi ; X86-NEXT: subl $12, %esp ; X86-NEXT: movb {{[0-9]+}}(%esp), %bh ; X86-NEXT: movb {{[0-9]+}}(%esp), %bl @@ -542,6 +541,12 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind { ; X86-NEXT: xorb %al, %cl ; X86-NEXT: subb %al, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarb $7, %al +; X86-NEXT: xorb %al, %cl +; X86-NEXT: subb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movb {{[0-9]+}}(%esp), %bh ; X86-NEXT: movb %bh, %al ; X86-NEXT: sarb $7, %al @@ -572,40 +577,34 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind { ; X86-NEXT: sarb $7, %al ; X86-NEXT: xorb %al, %cl ; X86-NEXT: subb %al, %cl -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movb %al, %ah -; X86-NEXT: sarb $7, %ah -; X86-NEXT: xorb %ah, %al -; X86-NEXT: subb %ah, %al -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movb %al, 15(%esi) -; X86-NEXT: movb %cl, 14(%esi) -; X86-NEXT: movb %dl, 13(%esi) -; X86-NEXT: movb %ch, 12(%esi) -; X86-NEXT: movb %dh, 11(%esi) -; X86-NEXT: movb %bl, 10(%esi) -; X86-NEXT: movb %bh, 9(%esi) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: movb %al, 8(%esi) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: movb %al, 7(%esi) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: movb %al, 6(%esi) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: movb %al, 5(%esi) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: movb %al, 4(%esi) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: movb %al, 3(%esi) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: movb %al, 2(%esi) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: movb %al, 1(%esi) -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: movb %al, (%esi) -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb %cl, 15(%eax) +; X86-NEXT: movb %dl, 14(%eax) +; X86-NEXT: movb %ch, 13(%eax) +; X86-NEXT: movb %dh, 12(%eax) +; X86-NEXT: movb %bl, 11(%eax) +; X86-NEXT: movb %bh, 10(%eax) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: movb %cl, 9(%eax) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: movb %cl, 8(%eax) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: movb %cl, 7(%eax) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: movb %cl, 6(%eax) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: movb %cl, 5(%eax) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: movb %cl, 4(%eax) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: movb %cl, 3(%eax) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: movb %cl, 2(%eax) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: movb %cl, 1(%eax) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: movb %cl, (%eax) ; X86-NEXT: addl $12, %esp -; X86-NEXT: popl %esi ; X86-NEXT: popl %ebx ; X86-NEXT: retl $4 %r = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a, i1 false) diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 64e2afc1753cc..6d5f8a78cb1d7 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -5095,16 +5095,16 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq 16(%rdi), %rax -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: movq %rax, %r8 -; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: movq %rax, %r10 -; AVX-NEXT: movl %eax, %r11d -; AVX-NEXT: movl %eax, %ebx -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: shrl $8, %eax -; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: movq 16(%rdi), %rcx +; AVX-NEXT: movq %rcx, %rax +; AVX-NEXT: movq %rcx, %r8 +; AVX-NEXT: movq %rcx, %r9 +; AVX-NEXT: movq %rcx, %r10 +; AVX-NEXT: movl %ecx, %r11d +; AVX-NEXT: movl %ecx, %ebx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: shrl $8, %ecx +; AVX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; AVX-NEXT: shrl $16, %ebx ; AVX-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 ; AVX-NEXT: shrl $24, %r11d @@ -5115,74 +5115,74 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; AVX-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 ; AVX-NEXT: shrq $48, %r8 ; AVX-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 -; AVX-NEXT: movq 24(%rdi), %rax -; AVX-NEXT: shrq $56, %rcx -; AVX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $8, %ecx -; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $24, %ecx -; AVX-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: shrq $40, %rcx -; AVX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: shrq $48, %rcx -; AVX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movq (%rdi), %rcx +; AVX-NEXT: movq 24(%rdi), %rcx ; AVX-NEXT: shrq $56, %rax -; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX-NEXT: movl %ecx, %eax ; AVX-NEXT: shrl $8, %eax -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX-NEXT: movl %ecx, %eax ; AVX-NEXT: shrl $16, %eax -; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX-NEXT: movl %ecx, %eax ; AVX-NEXT: shrl $24, %eax -; AVX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: shrq $32, %rax -; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: shrq $40, %rax -; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: shrq $48, %rax -; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX-NEXT: movq 8(%rdi), %rax +; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX-NEXT: movq (%rdi), %rax ; AVX-NEXT: shrq $56, %rcx -; AVX-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 ; AVX-NEXT: movl %eax, %ecx ; AVX-NEXT: shrl $8, %ecx -; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 ; AVX-NEXT: movl %eax, %ecx ; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 ; AVX-NEXT: movl %eax, %ecx ; AVX-NEXT: shrl $24, %ecx -; AVX-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 ; AVX-NEXT: movq %rax, %rcx ; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 ; AVX-NEXT: movq %rax, %rcx ; AVX-NEXT: shrq $40, %rcx -; AVX-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 ; AVX-NEXT: movq %rax, %rcx ; AVX-NEXT: shrq $48, %rcx -; AVX-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; AVX-NEXT: movq 8(%rdi), %rcx ; AVX-NEXT: shrq $56, %rax -; AVX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: shrl $8, %eax +; AVX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: shrl $16, %eax +; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: shrl $24, %eax +; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX-NEXT: movq %rcx, %rax +; AVX-NEXT: shrq $32, %rax +; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX-NEXT: movq %rcx, %rax +; AVX-NEXT: shrq $40, %rax +; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX-NEXT: movq %rcx, %rax +; AVX-NEXT: shrq $48, %rax +; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrq $56, %rcx +; AVX-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm3 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 @@ -5197,16 +5197,16 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: movq 16(%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq %rax, %r9 -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movl %eax, %r11d -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: shrl $8, %eax -; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movq 16(%rdi), %rcx +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: movq %rcx, %r8 +; AVX2-NEXT: movq %rcx, %r9 +; AVX2-NEXT: movq %rcx, %r10 +; AVX2-NEXT: movl %ecx, %r11d +; AVX2-NEXT: movl %ecx, %ebx +; AVX2-NEXT: vmovd %ecx, %xmm0 +; AVX2-NEXT: shrl $8, %ecx +; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: shrl $16, %ebx ; AVX2-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 ; AVX2-NEXT: shrl $24, %r11d @@ -5217,74 +5217,74 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr % ; AVX2-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 ; AVX2-NEXT: shrq $48, %r8 ; AVX2-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 -; AVX2-NEXT: movq 24(%rdi), %rax -; AVX2-NEXT: shrq $56, %rcx -; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl $8, %ecx -; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl $16, %ecx -; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl $24, %ecx -; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $32, %rcx -; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $40, %rcx -; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $48, %rcx -; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq (%rdi), %rcx +; AVX2-NEXT: movq 24(%rdi), %rcx ; AVX2-NEXT: shrq $56, %rax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: shrl $8, %eax -; AVX2-NEXT: vmovd %ecx, %xmm1 -; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: shrl $24, %eax -; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: shrq $32, %rax -; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: shrq $40, %rax -; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: shrq $48, %rax -; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX2-NEXT: movq 8(%rdi), %rax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movq (%rdi), %rax ; AVX2-NEXT: shrq $56, %rcx -; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $8, %ecx -; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 ; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $16, %ecx -; AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 ; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $24, %ecx -; AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq $32, %rcx -; AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq $40, %rcx -; AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq $48, %rcx -; AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: movq 8(%rdi), %rcx ; AVX2-NEXT: shrq $56, %rax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: shrl $8, %eax +; AVX2-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: shrl $16, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: shrl $24, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: shrq $32, %rax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: shrq $40, %rax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: shrq $48, %rax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX2-NEXT: shrq $56, %rcx +; AVX2-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index d42b994357447..3e7d1138132c4 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -1746,7 +1746,7 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d @@ -1758,56 +1758,56 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: addq %rax, %rcx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: addq %rbp, %rax -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r13,%rbp), %r13 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r12,%rbp), %r12 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r15,%rbp), %r15 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r14,%rbp), %r14 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%rbx,%rbp), %rbx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r11,%rbp), %r11 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r10,%rbp), %r10 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r9,%rbp), %r9 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r8,%rbp), %r8 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%rdi,%rbp), %rdi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%rsi,%rbp), %rsi +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%rdx,%rbp), %rdx -; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: addq %rdx, %rbp ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE2-NEXT: leaq -1(%rbp,%rdx), %rdx -; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE2-NEXT: leaq -1(%rbp,%rdx), %rdx -; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: xorl %ebp, %ebp -; SSE2-NEXT: addq $-1, %rcx -; SSE2-NEXT: movl $0, %edx -; SSE2-NEXT: adcq $-1, %rdx -; SSE2-NEXT: addq $-1, %rax -; SSE2-NEXT: adcq $-1, %rbp -; SSE2-NEXT: shldq $63, %rax, %rbp -; SSE2-NEXT: shldq $63, %rcx, %rdx -; SSE2-NEXT: movq %rdx, %xmm1 -; SSE2-NEXT: movq %rbp, %xmm0 +; SSE2-NEXT: addq %rcx, %rdx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: leaq -1(%r13,%rcx), %r13 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: leaq -1(%r12,%rcx), %r12 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: leaq -1(%r15,%rcx), %r15 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: leaq -1(%r14,%rcx), %r14 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: leaq -1(%rbx,%rcx), %rbx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: leaq -1(%r11,%rcx), %r11 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: leaq -1(%r10,%rcx), %r10 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: leaq -1(%r9,%rcx), %r9 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: leaq -1(%r8,%rcx), %r8 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: leaq -1(%rdi,%rcx), %rdi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: leaq -1(%rsi,%rcx), %rsi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: leaq -1(%rax,%rcx), %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: leaq -1(%rcx,%rax), %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: leaq -1(%rcx,%rax), %rax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: xorl %ecx, %ecx +; SSE2-NEXT: addq $-1, %rbp +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: adcq $-1, %rax +; SSE2-NEXT: addq $-1, %rdx +; SSE2-NEXT: adcq $-1, %rcx +; SSE2-NEXT: shldq $63, %rdx, %rcx +; SSE2-NEXT: shldq $63, %rbp, %rax +; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: movq %rcx, %xmm0 ; SSE2-NEXT: shrq %r13 ; SSE2-NEXT: movq %r13, %xmm3 ; SSE2-NEXT: shrq %r12 @@ -1825,14 +1825,14 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-NEXT: shrq %r9 ; SSE2-NEXT: movq %r9, %xmm8 ; SSE2-NEXT: shrq %r8 -; SSE2-NEXT: movq %r8, %xmm10 +; SSE2-NEXT: movq %r8, %xmm11 ; SSE2-NEXT: shrq %rdi -; SSE2-NEXT: movq %rdi, %xmm11 +; SSE2-NEXT: movq %rdi, %xmm12 ; SSE2-NEXT: shrq %rsi -; SSE2-NEXT: movq %rsi, %xmm12 +; SSE2-NEXT: movq %rsi, %xmm13 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm13 +; SSE2-NEXT: movq %rax, %xmm10 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE2-NEXT: shrq %rax ; SSE2-NEXT: movq %rax, %xmm14 @@ -1857,18 +1857,18 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; SSE2-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,0,0] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm8, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; SSE2-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; SSE2-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,0,1] ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm13, %xmm2 +; SSE2-NEXT: pandn %xmm10, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] @@ -1898,67 +1898,67 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: vpextrw $5, %xmm0, %eax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $6, %xmm0, %r10d -; AVX1-NEXT: vpextrw $7, %xmm0, %edx +; AVX1-NEXT: vpextrw $6, %xmm0, %ebx +; AVX1-NEXT: vpextrw $7, %xmm0, %esi ; AVX1-NEXT: vpextrw $0, %xmm3, %edi ; AVX1-NEXT: vpextrw $1, %xmm3, %r8d ; AVX1-NEXT: vpextrw $2, %xmm3, %r9d -; AVX1-NEXT: vpextrw $3, %xmm3, %r11d -; AVX1-NEXT: vpextrw $4, %xmm3, %ebx +; AVX1-NEXT: vpextrw $3, %xmm3, %r10d +; AVX1-NEXT: vpextrw $4, %xmm3, %r11d ; AVX1-NEXT: vpextrw $5, %xmm3, %r14d ; AVX1-NEXT: vpextrw $6, %xmm3, %r15d -; AVX1-NEXT: vpextrw $7, %xmm3, %esi -; AVX1-NEXT: vpextrw $1, %xmm0, %r13d +; AVX1-NEXT: vpextrw $7, %xmm3, %edx +; AVX1-NEXT: vpextrw $1, %xmm0, %eax ; AVX1-NEXT: vpextrw $0, %xmm0, %r12d ; AVX1-NEXT: vpextrw $1, %xmm1, %ecx -; AVX1-NEXT: addq %r13, %rcx +; AVX1-NEXT: addq %rax, %rcx ; AVX1-NEXT: vpextrw $0, %xmm1, %eax ; AVX1-NEXT: addq %r12, %rax ; AVX1-NEXT: vpextrw $7, %xmm2, %r12d -; AVX1-NEXT: leaq -1(%rsi,%r12), %rsi +; AVX1-NEXT: leaq -1(%rdx,%r12), %rdx ; AVX1-NEXT: vpextrw $6, %xmm2, %r12d ; AVX1-NEXT: leaq -1(%r15,%r12), %rbp ; AVX1-NEXT: vpextrw $5, %xmm2, %r15d ; AVX1-NEXT: leaq -1(%r14,%r15), %r13 ; AVX1-NEXT: vpextrw $4, %xmm2, %r14d -; AVX1-NEXT: leaq -1(%rbx,%r14), %r12 -; AVX1-NEXT: vpextrw $3, %xmm2, %ebx -; AVX1-NEXT: leaq -1(%r11,%rbx), %r15 -; AVX1-NEXT: vpextrw $2, %xmm2, %r11d -; AVX1-NEXT: leaq -1(%r9,%r11), %r14 +; AVX1-NEXT: leaq -1(%r11,%r14), %r12 +; AVX1-NEXT: vpextrw $3, %xmm2, %r11d +; AVX1-NEXT: leaq -1(%r10,%r11), %r15 +; AVX1-NEXT: vpextrw $2, %xmm2, %r10d +; AVX1-NEXT: leaq -1(%r9,%r10), %r14 ; AVX1-NEXT: vpextrw $1, %xmm2, %r9d -; AVX1-NEXT: leaq -1(%r8,%r9), %rbx +; AVX1-NEXT: leaq -1(%r8,%r9), %r11 ; AVX1-NEXT: vpextrw $0, %xmm2, %r8d -; AVX1-NEXT: leaq -1(%rdi,%r8), %r11 +; AVX1-NEXT: leaq -1(%rdi,%r8), %r10 ; AVX1-NEXT: vpextrw $7, %xmm1, %edi -; AVX1-NEXT: leaq -1(%rdx,%rdi), %r9 -; AVX1-NEXT: vpextrw $6, %xmm1, %edx -; AVX1-NEXT: leaq -1(%r10,%rdx), %r8 -; AVX1-NEXT: vpextrw $5, %xmm1, %edx +; AVX1-NEXT: leaq -1(%rsi,%rdi), %r9 +; AVX1-NEXT: vpextrw $6, %xmm1, %esi +; AVX1-NEXT: leaq -1(%rbx,%rsi), %r8 +; AVX1-NEXT: vpextrw $5, %xmm1, %esi ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX1-NEXT: leaq -1(%rdi,%rdx), %rdi -; AVX1-NEXT: vpextrw $4, %xmm1, %edx -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX1-NEXT: leaq -1(%r10,%rdx), %rdx -; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $3, %xmm0, %edx -; AVX1-NEXT: vpextrw $3, %xmm1, %r10d -; AVX1-NEXT: leaq -1(%rdx,%r10), %rdx -; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $2, %xmm0, %edx -; AVX1-NEXT: vpextrw $2, %xmm1, %r10d -; AVX1-NEXT: leaq -1(%rdx,%r10), %rdx -; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: xorl %edx, %edx +; AVX1-NEXT: leaq -1(%rdi,%rsi), %rsi +; AVX1-NEXT: vpextrw $4, %xmm1, %edi +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; AVX1-NEXT: leaq -1(%rbx,%rdi), %rdi +; AVX1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrw $3, %xmm0, %edi +; AVX1-NEXT: vpextrw $3, %xmm1, %ebx +; AVX1-NEXT: leaq -1(%rdi,%rbx), %rdi +; AVX1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrw $2, %xmm0, %edi +; AVX1-NEXT: vpextrw $2, %xmm1, %ebx +; AVX1-NEXT: leaq -1(%rdi,%rbx), %rdi +; AVX1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: xorl %edi, %edi ; AVX1-NEXT: addq $-1, %rcx -; AVX1-NEXT: movl $0, %r10d -; AVX1-NEXT: adcq $-1, %r10 +; AVX1-NEXT: movl $0, %ebx +; AVX1-NEXT: adcq $-1, %rbx ; AVX1-NEXT: addq $-1, %rax -; AVX1-NEXT: adcq $-1, %rdx -; AVX1-NEXT: shldq $63, %rax, %rdx -; AVX1-NEXT: shldq $63, %rcx, %r10 -; AVX1-NEXT: shrq %rsi -; AVX1-NEXT: vmovq %rsi, %xmm0 +; AVX1-NEXT: adcq $-1, %rdi +; AVX1-NEXT: shldq $63, %rax, %rdi +; AVX1-NEXT: shldq $63, %rcx, %rbx +; AVX1-NEXT: shrq %rdx +; AVX1-NEXT: vmovq %rdx, %xmm0 ; AVX1-NEXT: shrq %rbp ; AVX1-NEXT: vmovq %rbp, %xmm1 ; AVX1-NEXT: shrq %r13 @@ -1969,21 +1969,21 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; AVX1-NEXT: vmovq %r15, %xmm4 ; AVX1-NEXT: shrq %r14 ; AVX1-NEXT: vmovq %r14, %xmm5 -; AVX1-NEXT: shrq %rbx -; AVX1-NEXT: vmovq %rbx, %xmm6 ; AVX1-NEXT: shrq %r11 -; AVX1-NEXT: vmovq %r11, %xmm7 +; AVX1-NEXT: vmovq %r11, %xmm6 +; AVX1-NEXT: shrq %r10 +; AVX1-NEXT: vmovq %r10, %xmm7 ; AVX1-NEXT: shrq %r9 ; AVX1-NEXT: vmovq %r9, %xmm8 ; AVX1-NEXT: shrq %r8 ; AVX1-NEXT: vmovq %r8, %xmm9 -; AVX1-NEXT: shrq %rdi -; AVX1-NEXT: vmovq %rdi, %xmm10 +; AVX1-NEXT: shrq %rsi +; AVX1-NEXT: vmovq %rsi, %xmm10 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: vmovq %rax, %xmm11 -; AVX1-NEXT: vmovq %r10, %xmm12 -; AVX1-NEXT: vmovq %rdx, %xmm13 +; AVX1-NEXT: vmovq %rbx, %xmm12 +; AVX1-NEXT: vmovq %rdi, %xmm13 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: vmovq %rax, %xmm14 @@ -2030,143 +2030,143 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-NEXT: vmovq %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX2-NEXT: vmovq %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-NEXT: vmovq %xmm7, %r13 +; AVX2-NEXT: vmovq %xmm7, %rsi ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX2-NEXT: vmovq %xmm2, %rbp +; AVX2-NEXT: vmovq %xmm2, %rdx ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm8 ; AVX2-NEXT: vmovq %xmm8, %r8 -; AVX2-NEXT: vpextrq $1, %xmm8, %r15 +; AVX2-NEXT: vpextrq $1, %xmm8, %r13 ; AVX2-NEXT: vpextrq $1, %xmm2, %r14 -; AVX2-NEXT: vpextrq $1, %xmm7, %rbx -; AVX2-NEXT: vpextrq $1, %xmm6, %rsi -; AVX2-NEXT: vpextrq $1, %xmm5, %rdx -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: vpextrq $1, %xmm3, %rax -; AVX2-NEXT: vmovq %xmm3, %rdi -; AVX2-NEXT: vpextrq $1, %xmm0, %r10 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX2-NEXT: vpextrq $1, %xmm7, %r15 +; AVX2-NEXT: vpextrq $1, %xmm6, %r12 +; AVX2-NEXT: vpextrq $1, %xmm4, %rbx +; AVX2-NEXT: vpextrq $1, %xmm1, %rdi +; AVX2-NEXT: vpextrq $1, %xmm3, %rcx +; AVX2-NEXT: vmovq %xmm3, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %r11 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm2 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm8 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm8 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm8 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-NEXT: vpextrq $1, %xmm9, %r11 -; AVX2-NEXT: addq %r15, %r11 +; AVX2-NEXT: vpextrq $1, %xmm9, %r9 +; AVX2-NEXT: addq %r13, %r9 +; AVX2-NEXT: movq %r9, %r13 ; AVX2-NEXT: vpextrq $1, %xmm8, %r9 ; AVX2-NEXT: addq %r14, %r9 ; AVX2-NEXT: movq %r9, %r14 -; AVX2-NEXT: vpextrq $1, %xmm7, %r9 -; AVX2-NEXT: addq %rbx, %r9 -; AVX2-NEXT: movq %r9, %rbx -; AVX2-NEXT: vpextrq $1, %xmm4, %r15 -; AVX2-NEXT: addq %rsi, %r15 -; AVX2-NEXT: vpextrq $1, %xmm5, %r12 -; AVX2-NEXT: addq %rdx, %r12 -; AVX2-NEXT: vpextrq $1, %xmm3, %r9 +; AVX2-NEXT: vpextrq $1, %xmm7, %r10 +; AVX2-NEXT: addq %r15, %r10 +; AVX2-NEXT: vpextrq $1, %xmm5, %r15 +; AVX2-NEXT: addq %r12, %r15 +; AVX2-NEXT: vpextrq $1, %xmm4, %r12 +; AVX2-NEXT: addq %rbx, %r12 +; AVX2-NEXT: vpextrq $1, %xmm3, %rbp +; AVX2-NEXT: addq %rdi, %rbp +; AVX2-NEXT: vpextrq $1, %xmm6, %r9 ; AVX2-NEXT: addq %rcx, %r9 -; AVX2-NEXT: vpextrq $1, %xmm6, %rsi -; AVX2-NEXT: addq %rax, %rsi -; AVX2-NEXT: vmovq %xmm6, %rdx -; AVX2-NEXT: addq %rdi, %rdx +; AVX2-NEXT: vmovq %xmm6, %rdi +; AVX2-NEXT: addq %rax, %rdi ; AVX2-NEXT: vpextrq $1, %xmm2, %rcx -; AVX2-NEXT: addq %r10, %rcx -; AVX2-NEXT: vmovq %xmm9, %r10 -; AVX2-NEXT: leaq -1(%r8,%r10), %rax +; AVX2-NEXT: addq %r11, %rcx +; AVX2-NEXT: vmovq %xmm9, %r11 +; AVX2-NEXT: leaq -1(%r8,%r11), %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm8, %rdi -; AVX2-NEXT: leaq -1(%rbp,%rdi), %rax +; AVX2-NEXT: vmovq %xmm8, %r8 +; AVX2-NEXT: leaq -1(%rdx,%r8), %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm7, %rdi -; AVX2-NEXT: leaq -1(%r13,%rdi), %rax +; AVX2-NEXT: vmovq %xmm7, %rdx +; AVX2-NEXT: leaq -1(%rsi,%rdx), %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm4, %rdi +; AVX2-NEXT: vmovq %xmm5, %rdx ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leaq -1(%rax,%rdi), %rax +; AVX2-NEXT: leaq -1(%rax,%rdx), %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm5, %rdi +; AVX2-NEXT: vmovq %xmm4, %rdx ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leaq -1(%rax,%rdi), %rax +; AVX2-NEXT: leaq -1(%rax,%rdx), %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm1, %rdi -; AVX2-NEXT: vmovq %xmm3, %r8 -; AVX2-NEXT: leaq -1(%rdi,%r8), %rax +; AVX2-NEXT: vmovq %xmm1, %rdx +; AVX2-NEXT: vmovq %xmm3, %rsi +; AVX2-NEXT: leaq -1(%rdx,%rsi), %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm0, %rdi -; AVX2-NEXT: vmovq %xmm2, %r8 -; AVX2-NEXT: leaq -1(%rdi,%r8), %rdi -; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: addq $-1, %r11 -; AVX2-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, %r8d -; AVX2-NEXT: adcq $-1, %r8 +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: vmovq %xmm2, %rsi +; AVX2-NEXT: leaq -1(%rdx,%rsi), %rdx +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addq $-1, %r13 +; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl $0, %edx +; AVX2-NEXT: adcq $-1, %rdx ; AVX2-NEXT: addq $-1, %r14 ; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, %edi -; AVX2-NEXT: adcq $-1, %rdi -; AVX2-NEXT: addq $-1, %rbx -; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, %r11d -; AVX2-NEXT: adcq $-1, %r11 +; AVX2-NEXT: movl $0, %esi +; AVX2-NEXT: adcq $-1, %rsi +; AVX2-NEXT: addq $-1, %r10 +; AVX2-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl $0, %r8d +; AVX2-NEXT: adcq $-1, %r8 ; AVX2-NEXT: addq $-1, %r15 ; AVX2-NEXT: movl $0, %r10d ; AVX2-NEXT: adcq $-1, %r10 ; AVX2-NEXT: addq $-1, %r12 +; AVX2-NEXT: movl $0, %ebx +; AVX2-NEXT: adcq $-1, %rbx +; AVX2-NEXT: addq $-1, %rbp ; AVX2-NEXT: movl $0, %r14d ; AVX2-NEXT: adcq $-1, %r14 ; AVX2-NEXT: addq $-1, %r9 -; AVX2-NEXT: movl $0, %ebp -; AVX2-NEXT: adcq $-1, %rbp -; AVX2-NEXT: addq $-1, %rsi ; AVX2-NEXT: movl $0, %r13d ; AVX2-NEXT: adcq $-1, %r13 -; AVX2-NEXT: addq $-1, %rdx -; AVX2-NEXT: movl $0, %ebx -; AVX2-NEXT: adcq $-1, %rbx +; AVX2-NEXT: addq $-1, %rdi +; AVX2-NEXT: movl $0, %r11d +; AVX2-NEXT: adcq $-1, %r11 ; AVX2-NEXT: addq $-1, %rcx ; AVX2-NEXT: movl $0, %eax ; AVX2-NEXT: adcq $-1, %rax ; AVX2-NEXT: shldq $63, %rcx, %rax -; AVX2-NEXT: shldq $63, %rdx, %rbx -; AVX2-NEXT: shldq $63, %rsi, %r13 -; AVX2-NEXT: shldq $63, %r9, %rbp -; AVX2-NEXT: shldq $63, %r12, %r14 +; AVX2-NEXT: shldq $63, %rdi, %r11 +; AVX2-NEXT: shldq $63, %r9, %r13 +; AVX2-NEXT: shldq $63, %rbp, %r14 +; AVX2-NEXT: shldq $63, %r12, %rbx ; AVX2-NEXT: shldq $63, %r15, %r10 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %r11 +; AVX2-NEXT: shldq $63, %rcx, %r8 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %rdi +; AVX2-NEXT: shldq $63, %rcx, %rsi ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %r8 -; AVX2-NEXT: vmovq %r8, %xmm0 +; AVX2-NEXT: shldq $63, %rcx, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm0 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: vmovq %rdi, %xmm2 +; AVX2-NEXT: vmovq %rsi, %xmm2 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: vmovq %rcx, %xmm3 -; AVX2-NEXT: vmovq %r11, %xmm4 +; AVX2-NEXT: vmovq %r8, %xmm4 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: vmovq %rcx, %xmm5 @@ -2174,16 +2174,16 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: vmovq %rcx, %xmm7 -; AVX2-NEXT: vmovq %r14, %xmm8 +; AVX2-NEXT: vmovq %rbx, %xmm8 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: vmovq %rcx, %xmm9 -; AVX2-NEXT: vmovq %rbp, %xmm10 +; AVX2-NEXT: vmovq %r14, %xmm10 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: vmovq %rcx, %xmm11 ; AVX2-NEXT: vmovq %r13, %xmm12 -; AVX2-NEXT: vmovq %rbx, %xmm13 +; AVX2-NEXT: vmovq %r11, %xmm13 ; AVX2-NEXT: vmovq %rax, %xmm14 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX2-NEXT: shrq %rax @@ -2228,118 +2228,117 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; AVX512-NEXT: pushq %r13 ; AVX512-NEXT: pushq %r12 ; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: vpextrq $1, %xmm4, %rbp -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512-NEXT: vmovq %xmm4, %rdi -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-NEXT: vmovq %xmm5, %r8 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512-NEXT: vmovq %xmm3, %r9 -; AVX512-NEXT: vpextrq $1, %xmm3, %r10 +; AVX512-NEXT: vmovq %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vmovq %xmm3, %r11 -; AVX512-NEXT: vpextrq $1, %xmm3, %rbx -; AVX512-NEXT: vpextrq $1, %xmm5, %rax -; AVX512-NEXT: vpextrq $1, %xmm4, %r12 -; AVX512-NEXT: vpextrq $1, %xmm1, %r15 -; AVX512-NEXT: vpextrq $1, %xmm0, %r14 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX512-NEXT: vmovq %xmm3, %r13 +; AVX512-NEXT: vpextrq $1, %xmm3, %rsi +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512-NEXT: vmovq %xmm3, %rdi +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512-NEXT: vmovq %xmm5, %r8 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vmovq %xmm2, %r9 +; AVX512-NEXT: vpextrq $1, %xmm2, %r10 +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, %r11 +; AVX512-NEXT: vpextrq $1, %xmm2, %rbx +; AVX512-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512-NEXT: vpextrq $1, %xmm3, %rcx +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vpextrq $1, %xmm0, %r14 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm4 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm8 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512-NEXT: vpextrq $1, %xmm8, %rsi -; AVX512-NEXT: addq %rax, %rsi -; AVX512-NEXT: vpextrq $1, %xmm7, %rdx -; AVX512-NEXT: addq %r12, %rdx -; AVX512-NEXT: vpextrq $1, %xmm4, %rcx -; AVX512-NEXT: addq %r15, %rcx -; AVX512-NEXT: vpextrq $1, %xmm3, %rax +; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512-NEXT: vpextrq $1, %xmm8, %rbp +; AVX512-NEXT: addq %rdx, %rbp +; AVX512-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512-NEXT: addq %rcx, %rdx +; AVX512-NEXT: vpextrq $1, %xmm3, %rcx +; AVX512-NEXT: addq %rax, %rcx +; AVX512-NEXT: vpextrq $1, %xmm2, %rax ; AVX512-NEXT: addq %r14, %rax ; AVX512-NEXT: vpextrq $1, %xmm9, %r14 -; AVX512-NEXT: leaq -1(%rbx,%r14), %r13 +; AVX512-NEXT: leaq -1(%rbx,%r14), %r12 ; AVX512-NEXT: vmovq %xmm9, %rbx -; AVX512-NEXT: leaq -1(%r11,%rbx), %r12 -; AVX512-NEXT: vpextrq $1, %xmm2, %r11 -; AVX512-NEXT: leaq -1(%r10,%r11), %r15 -; AVX512-NEXT: vmovq %xmm2, %r10 -; AVX512-NEXT: leaq -1(%r9,%r10), %r14 +; AVX512-NEXT: leaq -1(%r11,%rbx), %r15 +; AVX512-NEXT: vpextrq $1, %xmm7, %r11 +; AVX512-NEXT: leaq -1(%r10,%r11), %r14 +; AVX512-NEXT: vmovq %xmm7, %r10 +; AVX512-NEXT: leaq -1(%r9,%r10), %rbx ; AVX512-NEXT: vmovq %xmm8, %r9 ; AVX512-NEXT: leaq -1(%r8,%r9), %r11 -; AVX512-NEXT: vmovq %xmm7, %r8 +; AVX512-NEXT: vmovq %xmm4, %r8 ; AVX512-NEXT: leaq -1(%rdi,%r8), %r10 ; AVX512-NEXT: vpextrq $1, %xmm6, %rdi -; AVX512-NEXT: leaq -1(%rbp,%rdi), %r9 -; AVX512-NEXT: vmovq %xmm6, %rdi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: leaq -1(%r8,%rdi), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vpextrq $1, %xmm5, %rdi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: leaq -1(%r8,%rdi), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vmovq %xmm5, %rdi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: leaq -1(%r8,%rdi), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vmovq %xmm1, %rdi -; AVX512-NEXT: vmovq %xmm4, %r8 -; AVX512-NEXT: leaq -1(%rdi,%r8), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vmovq %xmm0, %rdi -; AVX512-NEXT: vmovq %xmm3, %r8 -; AVX512-NEXT: leaq -1(%rdi,%r8), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: leaq -1(%rsi,%rdi), %r9 +; AVX512-NEXT: vmovq %xmm6, %rsi +; AVX512-NEXT: leaq -1(%r13,%rsi), %rsi +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vpextrq $1, %xmm5, %rsi +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AVX512-NEXT: leaq -1(%rdi,%rsi), %rsi +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vmovq %xmm5, %rsi +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AVX512-NEXT: leaq -1(%rdi,%rsi), %rsi +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vmovq %xmm1, %rsi +; AVX512-NEXT: vmovq %xmm3, %rdi +; AVX512-NEXT: leaq -1(%rsi,%rdi), %rsi +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vmovq %xmm0, %rsi +; AVX512-NEXT: vmovq %xmm2, %rdi +; AVX512-NEXT: leaq -1(%rsi,%rdi), %rsi +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: addq $-1, %rsi +; AVX512-NEXT: addq $-1, %rbp +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: adcq $-1, %rsi +; AVX512-NEXT: addq $-1, %rdx ; AVX512-NEXT: movl $0, %edi ; AVX512-NEXT: adcq $-1, %rdi -; AVX512-NEXT: addq $-1, %rdx -; AVX512-NEXT: movl $0, %ebp -; AVX512-NEXT: adcq $-1, %rbp ; AVX512-NEXT: addq $-1, %rcx -; AVX512-NEXT: movl $0, %ebx -; AVX512-NEXT: adcq $-1, %rbx +; AVX512-NEXT: movl $0, %r13d +; AVX512-NEXT: adcq $-1, %r13 ; AVX512-NEXT: addq $-1, %rax ; AVX512-NEXT: adcq $-1, %r8 ; AVX512-NEXT: shldq $63, %rax, %r8 -; AVX512-NEXT: shldq $63, %rcx, %rbx -; AVX512-NEXT: shldq $63, %rdx, %rbp -; AVX512-NEXT: shldq $63, %rsi, %rdi -; AVX512-NEXT: shrq %r13 -; AVX512-NEXT: vmovq %r13, %xmm0 +; AVX512-NEXT: shldq $63, %rcx, %r13 +; AVX512-NEXT: shldq $63, %rdx, %rdi +; AVX512-NEXT: shldq $63, %rbp, %rsi ; AVX512-NEXT: shrq %r12 -; AVX512-NEXT: vmovq %r12, %xmm1 +; AVX512-NEXT: vmovq %r12, %xmm0 ; AVX512-NEXT: shrq %r15 -; AVX512-NEXT: vmovq %r15, %xmm2 +; AVX512-NEXT: vmovq %r15, %xmm1 ; AVX512-NEXT: shrq %r14 -; AVX512-NEXT: vmovq %r14, %xmm3 -; AVX512-NEXT: vmovq %rdi, %xmm4 +; AVX512-NEXT: vmovq %r14, %xmm2 +; AVX512-NEXT: shrq %rbx +; AVX512-NEXT: vmovq %rbx, %xmm3 +; AVX512-NEXT: vmovq %rsi, %xmm4 ; AVX512-NEXT: shrq %r11 ; AVX512-NEXT: vmovq %r11, %xmm5 -; AVX512-NEXT: vmovq %rbp, %xmm6 +; AVX512-NEXT: vmovq %rdi, %xmm6 ; AVX512-NEXT: shrq %r10 ; AVX512-NEXT: vmovq %r10, %xmm7 ; AVX512-NEXT: shrq %r9 @@ -2353,7 +2352,7 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX512-NEXT: shrq %rax ; AVX512-NEXT: vmovq %rax, %xmm11 -; AVX512-NEXT: vmovq %rbx, %xmm12 +; AVX512-NEXT: vmovq %r13, %xmm12 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX512-NEXT: shrq %rax ; AVX512-NEXT: vmovq %rax, %xmm13 diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll index 9411aad9a21e4..b39b089faa2a5 100644 --- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -910,27 +910,27 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL-NEXT: kandw %k2, %k0, %k0 ; KNL-NEXT: kmovw %r10d, %k2 ; KNL-NEXT: kandw %k1, %k2, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: kshiftrw $1, %k0, %k1 ; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kshiftrw $1, %k0, %k1 +; KNL-NEXT: kmovw %k1, %r9d ; KNL-NEXT: kshiftrw $2, %k0, %k1 -; KNL-NEXT: kmovw %k1, %esi +; KNL-NEXT: kmovw %k1, %r8d ; KNL-NEXT: kshiftrw $3, %k0, %k1 -; KNL-NEXT: kmovw %k1, %edi +; KNL-NEXT: kmovw %k1, %esi ; KNL-NEXT: kshiftrw $4, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: kmovw %k1, %edi ; KNL-NEXT: kshiftrw $5, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r8d +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftrw $6, %k0, %k1 ; KNL-NEXT: kmovw %k1, %r10d ; KNL-NEXT: kshiftrw $7, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r11d -; KNL-NEXT: kshiftrw $8, %k0, %k1 ; KNL-NEXT: kmovw %k1, %ebx +; KNL-NEXT: kshiftrw $8, %k0, %k1 +; KNL-NEXT: kmovw %k1, %ebp ; KNL-NEXT: kshiftrw $9, %k0, %k1 ; KNL-NEXT: kmovw %k1, %r14d ; KNL-NEXT: kshiftrw $10, %k0, %k1 -; KNL-NEXT: kmovw %k1, %ebp +; KNL-NEXT: kmovw %k1, %r11d ; KNL-NEXT: kshiftrw $11, %k0, %k1 ; KNL-NEXT: kmovw %k1, %r15d ; KNL-NEXT: kshiftrw $12, %k0, %k1 @@ -938,40 +938,40 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL-NEXT: kshiftrw $13, %k0, %k1 ; KNL-NEXT: kmovw %k1, %r13d ; KNL-NEXT: kshiftrw $14, %k0, %k1 -; KNL-NEXT: andl $1, %ecx -; KNL-NEXT: movb %cl, 2(%rax) -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: andl $1, %ecx ; KNL-NEXT: andl $1, %edx -; KNL-NEXT: leal (%rcx,%rdx,2), %ecx +; KNL-NEXT: movb %dl, 2(%rax) +; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: andl $1, %edx +; KNL-NEXT: andl $1, %r9d +; KNL-NEXT: leal (%rdx,%r9,2), %r9d ; KNL-NEXT: kmovw %k1, %edx ; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: andl $1, %r8d +; KNL-NEXT: leal (%r9,%r8,4), %r9d +; KNL-NEXT: kmovw %k0, %r8d ; KNL-NEXT: andl $1, %esi -; KNL-NEXT: leal (%rcx,%rsi,4), %ecx -; KNL-NEXT: kmovw %k0, %esi +; KNL-NEXT: leal (%r9,%rsi,8), %esi ; KNL-NEXT: andl $1, %edi -; KNL-NEXT: leal (%rcx,%rdi,8), %ecx -; KNL-NEXT: andl $1, %r9d -; KNL-NEXT: shll $4, %r9d -; KNL-NEXT: orl %ecx, %r9d -; KNL-NEXT: andl $1, %r8d -; KNL-NEXT: shll $5, %r8d -; KNL-NEXT: orl %r9d, %r8d +; KNL-NEXT: shll $4, %edi +; KNL-NEXT: orl %esi, %edi +; KNL-NEXT: andl $1, %ecx +; KNL-NEXT: shll $5, %ecx +; KNL-NEXT: orl %edi, %ecx ; KNL-NEXT: andl $1, %r10d ; KNL-NEXT: shll $6, %r10d -; KNL-NEXT: andl $1, %r11d -; KNL-NEXT: shll $7, %r11d -; KNL-NEXT: orl %r10d, %r11d ; KNL-NEXT: andl $1, %ebx -; KNL-NEXT: shll $8, %ebx -; KNL-NEXT: orl %r11d, %ebx +; KNL-NEXT: shll $7, %ebx +; KNL-NEXT: orl %r10d, %ebx +; KNL-NEXT: andl $1, %ebp +; KNL-NEXT: shll $8, %ebp +; KNL-NEXT: orl %ebx, %ebp ; KNL-NEXT: andl $1, %r14d ; KNL-NEXT: shll $9, %r14d -; KNL-NEXT: orl %ebx, %r14d -; KNL-NEXT: andl $1, %ebp -; KNL-NEXT: shll $10, %ebp -; KNL-NEXT: orl %r14d, %ebp -; KNL-NEXT: orl %r8d, %ebp +; KNL-NEXT: orl %ebp, %r14d +; KNL-NEXT: andl $1, %r11d +; KNL-NEXT: shll $10, %r11d +; KNL-NEXT: orl %r14d, %r11d +; KNL-NEXT: orl %ecx, %r11d ; KNL-NEXT: andl $1, %r15d ; KNL-NEXT: shll $11, %r15d ; KNL-NEXT: andl $1, %r12d @@ -983,11 +983,11 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL-NEXT: andl $1, %edx ; KNL-NEXT: shll $14, %edx ; KNL-NEXT: orl %r13d, %edx -; KNL-NEXT: andl $1, %esi -; KNL-NEXT: shll $15, %esi -; KNL-NEXT: orl %edx, %esi -; KNL-NEXT: orl %ebp, %esi -; KNL-NEXT: movw %si, (%rax) +; KNL-NEXT: andl $1, %r8d +; KNL-NEXT: shll $15, %r8d +; KNL-NEXT: orl %edx, %r8d +; KNL-NEXT: orl %r11d, %r8d +; KNL-NEXT: movw %r8w, (%rax) ; KNL-NEXT: popq %rbx ; KNL-NEXT: popq %r12 ; KNL-NEXT: popq %r13 @@ -1085,16 +1085,16 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: kshiftrd $21, %k1, %k1 ; SKX-NEXT: kord %k1, %k0, %k0 ; SKX-NEXT: movl $-2049, %edi ## imm = 0xF7FF -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SKX-NEXT: kandd %k1, %k0, %k0 +; SKX-NEXT: kmovd %edi, %k6 +; SKX-NEXT: kandd %k6, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftld $31, %k1, %k1 ; SKX-NEXT: kshiftrd $20, %k1, %k1 ; SKX-NEXT: kord %k1, %k0, %k0 ; SKX-NEXT: movl $-4097, %edi ## imm = 0xEFFF -; SKX-NEXT: kmovd %edi, %k6 -; SKX-NEXT: kandd %k6, %k0, %k0 +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SKX-NEXT: kandd %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftld $31, %k1, %k1 ; SKX-NEXT: kshiftrd $19, %k1, %k1 @@ -1190,14 +1190,14 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: kshiftrd $21, %k1, %k1 ; SKX-NEXT: kord %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 4-byte Reload -; SKX-NEXT: kandd %k7, %k0, %k0 +; SKX-NEXT: kandd %k6, %k0, %k0 ; SKX-NEXT: kshiftld $31, %k1, %k1 ; SKX-NEXT: kshiftrd $20, %k1, %k1 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 ; SKX-NEXT: kord %k1, %k0, %k0 -; SKX-NEXT: kandd %k6, %k0, %k0 -; SKX-NEXT: kshiftld $31, %k7, %k1 +; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload +; SKX-NEXT: kandd %k1, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k6, %k1 ; SKX-NEXT: kshiftrd $19, %k1, %k1 ; SKX-NEXT: kord %k1, %k0, %k0 ; SKX-NEXT: kandd %k5, %k0, %k0 @@ -1223,27 +1223,27 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload ; SKX-NEXT: kandd %k1, %k0, %k0 ; SKX-NEXT: kshiftrd $16, %k0, %k1 -; SKX-NEXT: kmovd %k1, %ecx -; SKX-NEXT: kshiftrd $1, %k0, %k1 ; SKX-NEXT: kmovd %k1, %edx +; SKX-NEXT: kshiftrd $1, %k0, %k1 +; SKX-NEXT: kmovd %k1, %r9d ; SKX-NEXT: kshiftrd $2, %k0, %k1 -; SKX-NEXT: kmovd %k1, %esi +; SKX-NEXT: kmovd %k1, %r8d ; SKX-NEXT: kshiftrd $3, %k0, %k1 -; SKX-NEXT: kmovd %k1, %edi +; SKX-NEXT: kmovd %k1, %esi ; SKX-NEXT: kshiftrd $4, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r9d +; SKX-NEXT: kmovd %k1, %edi ; SKX-NEXT: kshiftrd $5, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r8d +; SKX-NEXT: kmovd %k1, %ecx ; SKX-NEXT: kshiftrd $6, %k0, %k1 ; SKX-NEXT: kmovd %k1, %r10d ; SKX-NEXT: kshiftrd $7, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r11d -; SKX-NEXT: kshiftrd $8, %k0, %k1 ; SKX-NEXT: kmovd %k1, %ebx +; SKX-NEXT: kshiftrd $8, %k0, %k1 +; SKX-NEXT: kmovd %k1, %ebp ; SKX-NEXT: kshiftrd $9, %k0, %k1 ; SKX-NEXT: kmovd %k1, %r14d ; SKX-NEXT: kshiftrd $10, %k0, %k1 -; SKX-NEXT: kmovd %k1, %ebp +; SKX-NEXT: kmovd %k1, %r11d ; SKX-NEXT: kshiftrd $11, %k0, %k1 ; SKX-NEXT: kmovd %k1, %r15d ; SKX-NEXT: kshiftrd $12, %k0, %k1 @@ -1251,40 +1251,40 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: kshiftrd $13, %k0, %k1 ; SKX-NEXT: kmovd %k1, %r13d ; SKX-NEXT: kshiftrd $14, %k0, %k1 -; SKX-NEXT: andl $1, %ecx -; SKX-NEXT: movb %cl, 2(%rax) -; SKX-NEXT: kmovd %k0, %ecx -; SKX-NEXT: andl $1, %ecx ; SKX-NEXT: andl $1, %edx -; SKX-NEXT: leal (%rcx,%rdx,2), %ecx +; SKX-NEXT: movb %dl, 2(%rax) +; SKX-NEXT: kmovd %k0, %edx +; SKX-NEXT: andl $1, %edx +; SKX-NEXT: andl $1, %r9d +; SKX-NEXT: leal (%rdx,%r9,2), %r9d ; SKX-NEXT: kmovd %k1, %edx ; SKX-NEXT: kshiftrd $15, %k0, %k0 +; SKX-NEXT: andl $1, %r8d +; SKX-NEXT: leal (%r9,%r8,4), %r9d +; SKX-NEXT: kmovd %k0, %r8d ; SKX-NEXT: andl $1, %esi -; SKX-NEXT: leal (%rcx,%rsi,4), %ecx -; SKX-NEXT: kmovd %k0, %esi +; SKX-NEXT: leal (%r9,%rsi,8), %esi ; SKX-NEXT: andl $1, %edi -; SKX-NEXT: leal (%rcx,%rdi,8), %ecx -; SKX-NEXT: andl $1, %r9d -; SKX-NEXT: shll $4, %r9d -; SKX-NEXT: orl %ecx, %r9d -; SKX-NEXT: andl $1, %r8d -; SKX-NEXT: shll $5, %r8d -; SKX-NEXT: orl %r9d, %r8d +; SKX-NEXT: shll $4, %edi +; SKX-NEXT: orl %esi, %edi +; SKX-NEXT: andl $1, %ecx +; SKX-NEXT: shll $5, %ecx +; SKX-NEXT: orl %edi, %ecx ; SKX-NEXT: andl $1, %r10d ; SKX-NEXT: shll $6, %r10d -; SKX-NEXT: andl $1, %r11d -; SKX-NEXT: shll $7, %r11d -; SKX-NEXT: orl %r10d, %r11d ; SKX-NEXT: andl $1, %ebx -; SKX-NEXT: shll $8, %ebx -; SKX-NEXT: orl %r11d, %ebx +; SKX-NEXT: shll $7, %ebx +; SKX-NEXT: orl %r10d, %ebx +; SKX-NEXT: andl $1, %ebp +; SKX-NEXT: shll $8, %ebp +; SKX-NEXT: orl %ebx, %ebp ; SKX-NEXT: andl $1, %r14d ; SKX-NEXT: shll $9, %r14d -; SKX-NEXT: orl %ebx, %r14d -; SKX-NEXT: andl $1, %ebp -; SKX-NEXT: shll $10, %ebp -; SKX-NEXT: orl %r14d, %ebp -; SKX-NEXT: orl %r8d, %ebp +; SKX-NEXT: orl %ebp, %r14d +; SKX-NEXT: andl $1, %r11d +; SKX-NEXT: shll $10, %r11d +; SKX-NEXT: orl %r14d, %r11d +; SKX-NEXT: orl %ecx, %r11d ; SKX-NEXT: andl $1, %r15d ; SKX-NEXT: shll $11, %r15d ; SKX-NEXT: andl $1, %r12d @@ -1296,11 +1296,11 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: andl $1, %edx ; SKX-NEXT: shll $14, %edx ; SKX-NEXT: orl %r13d, %edx -; SKX-NEXT: andl $1, %esi -; SKX-NEXT: shll $15, %esi -; SKX-NEXT: orl %edx, %esi -; SKX-NEXT: orl %ebp, %esi -; SKX-NEXT: movw %si, (%rax) +; SKX-NEXT: andl $1, %r8d +; SKX-NEXT: shll $15, %r8d +; SKX-NEXT: orl %edx, %r8d +; SKX-NEXT: orl %r11d, %r8d +; SKX-NEXT: movw %r8w, (%rax) ; SKX-NEXT: popq %rbx ; SKX-NEXT: popq %r12 ; SKX-NEXT: popq %r13 @@ -1726,16 +1726,16 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; FASTISEL-NEXT: kshiftrd $21, %k1, %k1 ; FASTISEL-NEXT: kord %k1, %k0, %k0 ; FASTISEL-NEXT: movl $-2049, %edi ## imm = 0xF7FF -; FASTISEL-NEXT: kmovd %edi, %k1 -; FASTISEL-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; FASTISEL-NEXT: kandd %k1, %k0, %k0 +; FASTISEL-NEXT: kmovd %edi, %k6 +; FASTISEL-NEXT: kandd %k6, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftld $31, %k1, %k1 ; FASTISEL-NEXT: kshiftrd $20, %k1, %k1 ; FASTISEL-NEXT: kord %k1, %k0, %k0 ; FASTISEL-NEXT: movl $-4097, %edi ## imm = 0xEFFF -; FASTISEL-NEXT: kmovd %edi, %k6 -; FASTISEL-NEXT: kandd %k6, %k0, %k0 +; FASTISEL-NEXT: kmovd %edi, %k1 +; FASTISEL-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; FASTISEL-NEXT: kandd %k1, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftld $31, %k1, %k1 ; FASTISEL-NEXT: kshiftrd $19, %k1, %k1 @@ -1831,14 +1831,14 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; FASTISEL-NEXT: kshiftrd $21, %k1, %k1 ; FASTISEL-NEXT: kord %k1, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 4-byte Reload -; FASTISEL-NEXT: kandd %k7, %k0, %k0 +; FASTISEL-NEXT: kandd %k6, %k0, %k0 ; FASTISEL-NEXT: kshiftld $31, %k1, %k1 ; FASTISEL-NEXT: kshiftrd $20, %k1, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 ; FASTISEL-NEXT: kord %k1, %k0, %k0 -; FASTISEL-NEXT: kandd %k6, %k0, %k0 -; FASTISEL-NEXT: kshiftld $31, %k7, %k1 +; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload +; FASTISEL-NEXT: kandd %k1, %k0, %k0 +; FASTISEL-NEXT: kshiftld $31, %k6, %k1 ; FASTISEL-NEXT: kshiftrd $19, %k1, %k1 ; FASTISEL-NEXT: kord %k1, %k0, %k0 ; FASTISEL-NEXT: kandd %k5, %k0, %k0 @@ -1864,27 +1864,27 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload ; FASTISEL-NEXT: kandd %k1, %k0, %k0 ; FASTISEL-NEXT: kshiftrd $16, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %ecx -; FASTISEL-NEXT: kshiftrd $1, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %edx +; FASTISEL-NEXT: kshiftrd $1, %k0, %k1 +; FASTISEL-NEXT: kmovd %k1, %r9d ; FASTISEL-NEXT: kshiftrd $2, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %esi +; FASTISEL-NEXT: kmovd %k1, %r8d ; FASTISEL-NEXT: kshiftrd $3, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %edi +; FASTISEL-NEXT: kmovd %k1, %esi ; FASTISEL-NEXT: kshiftrd $4, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r9d +; FASTISEL-NEXT: kmovd %k1, %edi ; FASTISEL-NEXT: kshiftrd $5, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r8d +; FASTISEL-NEXT: kmovd %k1, %ecx ; FASTISEL-NEXT: kshiftrd $6, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %r10d ; FASTISEL-NEXT: kshiftrd $7, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r11d -; FASTISEL-NEXT: kshiftrd $8, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %ebx +; FASTISEL-NEXT: kshiftrd $8, %k0, %k1 +; FASTISEL-NEXT: kmovd %k1, %ebp ; FASTISEL-NEXT: kshiftrd $9, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %r14d ; FASTISEL-NEXT: kshiftrd $10, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %ebp +; FASTISEL-NEXT: kmovd %k1, %r11d ; FASTISEL-NEXT: kshiftrd $11, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %r15d ; FASTISEL-NEXT: kshiftrd $12, %k0, %k1 @@ -1892,40 +1892,40 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; FASTISEL-NEXT: kshiftrd $13, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %r13d ; FASTISEL-NEXT: kshiftrd $14, %k0, %k1 -; FASTISEL-NEXT: andl $1, %ecx -; FASTISEL-NEXT: movb %cl, 2(%rax) -; FASTISEL-NEXT: kmovd %k0, %ecx -; FASTISEL-NEXT: andl $1, %ecx ; FASTISEL-NEXT: andl $1, %edx -; FASTISEL-NEXT: leal (%rcx,%rdx,2), %ecx +; FASTISEL-NEXT: movb %dl, 2(%rax) +; FASTISEL-NEXT: kmovd %k0, %edx +; FASTISEL-NEXT: andl $1, %edx +; FASTISEL-NEXT: andl $1, %r9d +; FASTISEL-NEXT: leal (%rdx,%r9,2), %r9d ; FASTISEL-NEXT: kmovd %k1, %edx ; FASTISEL-NEXT: kshiftrd $15, %k0, %k0 +; FASTISEL-NEXT: andl $1, %r8d +; FASTISEL-NEXT: leal (%r9,%r8,4), %r9d +; FASTISEL-NEXT: kmovd %k0, %r8d ; FASTISEL-NEXT: andl $1, %esi -; FASTISEL-NEXT: leal (%rcx,%rsi,4), %ecx -; FASTISEL-NEXT: kmovd %k0, %esi +; FASTISEL-NEXT: leal (%r9,%rsi,8), %esi ; FASTISEL-NEXT: andl $1, %edi -; FASTISEL-NEXT: leal (%rcx,%rdi,8), %ecx -; FASTISEL-NEXT: andl $1, %r9d -; FASTISEL-NEXT: shll $4, %r9d -; FASTISEL-NEXT: orl %ecx, %r9d -; FASTISEL-NEXT: andl $1, %r8d -; FASTISEL-NEXT: shll $5, %r8d -; FASTISEL-NEXT: orl %r9d, %r8d +; FASTISEL-NEXT: shll $4, %edi +; FASTISEL-NEXT: orl %esi, %edi +; FASTISEL-NEXT: andl $1, %ecx +; FASTISEL-NEXT: shll $5, %ecx +; FASTISEL-NEXT: orl %edi, %ecx ; FASTISEL-NEXT: andl $1, %r10d ; FASTISEL-NEXT: shll $6, %r10d -; FASTISEL-NEXT: andl $1, %r11d -; FASTISEL-NEXT: shll $7, %r11d -; FASTISEL-NEXT: orl %r10d, %r11d ; FASTISEL-NEXT: andl $1, %ebx -; FASTISEL-NEXT: shll $8, %ebx -; FASTISEL-NEXT: orl %r11d, %ebx +; FASTISEL-NEXT: shll $7, %ebx +; FASTISEL-NEXT: orl %r10d, %ebx +; FASTISEL-NEXT: andl $1, %ebp +; FASTISEL-NEXT: shll $8, %ebp +; FASTISEL-NEXT: orl %ebx, %ebp ; FASTISEL-NEXT: andl $1, %r14d ; FASTISEL-NEXT: shll $9, %r14d -; FASTISEL-NEXT: orl %ebx, %r14d -; FASTISEL-NEXT: andl $1, %ebp -; FASTISEL-NEXT: shll $10, %ebp -; FASTISEL-NEXT: orl %r14d, %ebp -; FASTISEL-NEXT: orl %r8d, %ebp +; FASTISEL-NEXT: orl %ebp, %r14d +; FASTISEL-NEXT: andl $1, %r11d +; FASTISEL-NEXT: shll $10, %r11d +; FASTISEL-NEXT: orl %r14d, %r11d +; FASTISEL-NEXT: orl %ecx, %r11d ; FASTISEL-NEXT: andl $1, %r15d ; FASTISEL-NEXT: shll $11, %r15d ; FASTISEL-NEXT: andl $1, %r12d @@ -1937,11 +1937,11 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; FASTISEL-NEXT: andl $1, %edx ; FASTISEL-NEXT: shll $14, %edx ; FASTISEL-NEXT: orl %r13d, %edx -; FASTISEL-NEXT: andl $1, %esi -; FASTISEL-NEXT: shll $15, %esi -; FASTISEL-NEXT: orl %edx, %esi -; FASTISEL-NEXT: orl %ebp, %esi -; FASTISEL-NEXT: movw %si, (%rax) +; FASTISEL-NEXT: andl $1, %r8d +; FASTISEL-NEXT: shll $15, %r8d +; FASTISEL-NEXT: orl %edx, %r8d +; FASTISEL-NEXT: orl %r11d, %r8d +; FASTISEL-NEXT: movw %r8w, (%rax) ; FASTISEL-NEXT: popq %rbx ; FASTISEL-NEXT: popq %r12 ; FASTISEL-NEXT: popq %r13 @@ -2380,9 +2380,8 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftrb $5, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: movb $-9, %dil -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; SKX-NEXT: kandb %k1, %k0, %k0 +; SKX-NEXT: kmovd %edi, %k6 +; SKX-NEXT: kandb %k6, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $4, %k2, %k2 @@ -2403,13 +2402,14 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftrb $2, %k4, %k4 ; SKX-NEXT: korb %k4, %k0, %k0 ; SKX-NEXT: movb $-65, %dil -; SKX-NEXT: kmovd %edi, %k6 -; SKX-NEXT: kandb %k6, %k0, %k0 +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: kandb %k1, %k0, %k0 +; SKX-NEXT: kmovq %k1, %k2 +; SKX-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: korb %k5, %k0, %k0 -; SKX-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SKX-NEXT: korb %k5, %k0, %k5 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: kshiftlb $7, %k0, %k0 ; SKX-NEXT: kshiftrb $6, %k0, %k0 @@ -2417,15 +2417,14 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k7, %k7 ; SKX-NEXT: kshiftrb $7, %k7, %k7 ; SKX-NEXT: korb %k0, %k7, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload -; SKX-NEXT: kandb %k2, %k0, %k0 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; SKX-NEXT: kandb %k4, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 ; SKX-NEXT: kshiftlb $7, %k7, %k7 ; SKX-NEXT: kshiftrb $5, %k7, %k7 ; SKX-NEXT: korb %k7, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload -; SKX-NEXT: kandb %k5, %k0, %k1 +; SKX-NEXT: kandb %k6, %k0, %k1 ; SKX-NEXT: kshiftlb $7, %k7, %k7 ; SKX-NEXT: kshiftrb $4, %k7, %k7 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 @@ -2434,19 +2433,18 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k0, %k0 ; SKX-NEXT: kshiftrb $3, %k0, %k0 ; SKX-NEXT: korb %k0, %k1, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload -; SKX-NEXT: kandb %k4, %k0, %k0 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; SKX-NEXT: kandb %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $2, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k6, %k0, %k0 +; SKX-NEXT: kandb %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $1, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload -; SKX-NEXT: kandb %k1, %k0, %k0 +; SKX-NEXT: kandb %k5, %k0, %k0 ; SKX-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: kshiftlb $7, %k0, %k0 @@ -2455,29 +2453,31 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $7, %k1, %k1 ; SKX-NEXT: korb %k0, %k1, %k0 -; SKX-NEXT: kandb %k2, %k0, %k0 +; SKX-NEXT: kmovq %k4, %k2 +; SKX-NEXT: kandb %k4, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovq %k5, %k7 -; SKX-NEXT: kandb %k5, %k0, %k0 +; SKX-NEXT: kmovq %k6, %k7 +; SKX-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SKX-NEXT: kandb %k6, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $4, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kandb %k3, %k0, %k0 -; SKX-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: korb %k1, %k0, %k0 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload ; SKX-NEXT: kandb %k4, %k0, %k0 ; SKX-NEXT: kshiftlb $7, %k5, %k1 ; SKX-NEXT: kshiftrb $2, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload ; SKX-NEXT: kandb %k6, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 @@ -2515,27 +2515,26 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kandb %k6, %k1, %k1 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; SKX-NEXT: korb %k5, %k1, %k5 -; SKX-NEXT: kshiftlb $7, %k7, %k1 -; SKX-NEXT: kshiftrb $6, %k1, %k1 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; SKX-NEXT: kshiftlb $7, %k7, %k7 -; SKX-NEXT: kshiftrb $7, %k7, %k7 -; SKX-NEXT: korb %k1, %k7, %k1 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; SKX-NEXT: kandb %k2, %k1, %k1 -; SKX-NEXT: kshiftlb $7, %k7, %k7 -; SKX-NEXT: kshiftrb $5, %k7, %k7 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 -; SKX-NEXT: korb %k7, %k1, %k1 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload -; SKX-NEXT: kandb %k3, %k1, %k1 +; SKX-NEXT: korb %k5, %k1, %k7 ; SKX-NEXT: kshiftlb $7, %k0, %k0 -; SKX-NEXT: kshiftrb $4, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; SKX-NEXT: kshiftlb $7, %k1, %k1 +; SKX-NEXT: kshiftrb $7, %k1, %k1 ; SKX-NEXT: korb %k0, %k1, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload -; SKX-NEXT: kandb %k6, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; SKX-NEXT: kandb %k2, %k0, %k0 +; SKX-NEXT: kshiftlb $7, %k1, %k1 +; SKX-NEXT: kshiftrb $5, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: korb %k1, %k0, %k0 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; SKX-NEXT: kandb %k1, %k0, %k0 +; SKX-NEXT: kshiftlb $7, %k5, %k1 +; SKX-NEXT: kshiftrb $4, %k1, %k1 +; SKX-NEXT: korb %k1, %k0, %k0 +; SKX-NEXT: kandb %k3, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k1 @@ -2545,13 +2544,12 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $2, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload -; SKX-NEXT: kandb %k2, %k0, %k0 +; SKX-NEXT: kandb %k6, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $1, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k5, %k0, %k0 +; SKX-NEXT: kandb %k7, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload ; SKX-NEXT: kandb %k5, %k0, %k0 @@ -2563,29 +2561,27 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftrb $7, %k1, %k1 ; SKX-NEXT: korb %k0, %k1, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload -; SKX-NEXT: kandb %k5, %k0, %k0 +; SKX-NEXT: kandb %k2, %k0, %k0 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k3, %k0, %k0 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; SKX-NEXT: kandb %k5, %k0, %k0 ; SKX-NEXT: kshiftlb $7, %k7, %k1 ; SKX-NEXT: kshiftrb $4, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k6, %k0, %k0 +; SKX-NEXT: kandb %k3, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: kandb %k4, %k0, %k0 -; SKX-NEXT: kmovq %k4, %k7 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $2, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k2, %k0, %k0 -; SKX-NEXT: kmovq %k2, %k3 +; SKX-NEXT: kandb %k6, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $1, %k1, %k1 @@ -2598,30 +2594,28 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $7, %k1, %k1 ; SKX-NEXT: korb %k0, %k1, %k0 -; SKX-NEXT: kmovq %k5, %k4 -; SKX-NEXT: kandb %k5, %k0, %k0 +; SKX-NEXT: kandb %k2, %k0, %k0 +; SKX-NEXT: kmovq %k2, %k7 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload ; SKX-NEXT: kandb %k5, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $4, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kandb %k6, %k0, %k2 +; SKX-NEXT: kandb %k3, %k0, %k2 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: korb %k1, %k2, %k1 -; SKX-NEXT: kmovq %k7, %k2 -; SKX-NEXT: kandb %k7, %k1, %k1 +; SKX-NEXT: kandb %k4, %k1, %k1 ; SKX-NEXT: kshiftlb $7, %k0, %k0 ; SKX-NEXT: kshiftrb $2, %k0, %k0 ; SKX-NEXT: korb %k0, %k1, %k0 -; SKX-NEXT: kandb %k3, %k0, %k0 +; SKX-NEXT: kandb %k6, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $1, %k1, %k1 @@ -2636,32 +2630,28 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $7, %k1, %k1 ; SKX-NEXT: korb %k0, %k1, %k0 -; SKX-NEXT: kandb %k4, %k0, %k0 -; SKX-NEXT: kmovq %k4, %k7 +; SKX-NEXT: kandb %k7, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovq %k5, %k3 ; SKX-NEXT: kandb %k5, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $4, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k6, %k0, %k0 +; SKX-NEXT: kandb %k3, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kandb %k2, %k0, %k0 -; SKX-NEXT: kmovq %k2, %k5 +; SKX-NEXT: kandb %k4, %k0, %k0 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload -; SKX-NEXT: kandb %k4, %k0, %k0 +; SKX-NEXT: kandb %k6, %k0, %k0 ; SKX-NEXT: kshiftlb $7, %k2, %k1 ; SKX-NEXT: kshiftrb $1, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 @@ -2677,22 +2667,22 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $5, %k2, %k2 ; SKX-NEXT: korb %k2, %k1, %k1 -; SKX-NEXT: kandb %k3, %k1, %k1 +; SKX-NEXT: kandb %k5, %k1, %k1 ; SKX-NEXT: kmovd %r8d, %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $4, %k2, %k2 ; SKX-NEXT: korb %k2, %k1, %k1 -; SKX-NEXT: kandb %k6, %k1, %k1 +; SKX-NEXT: kandb %k3, %k1, %k1 ; SKX-NEXT: kmovd %r9d, %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $3, %k2, %k2 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: korb %k2, %k1, %k1 -; SKX-NEXT: kandb %k5, %k1, %k1 +; SKX-NEXT: kandb %k4, %k1, %k1 ; SKX-NEXT: kshiftlb $7, %k3, %k2 ; SKX-NEXT: kshiftrb $2, %k2, %k2 ; SKX-NEXT: korb %k2, %k1, %k1 -; SKX-NEXT: kandb %k4, %k1, %k1 +; SKX-NEXT: kandb %k6, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $1, %k2, %k2 @@ -3189,129 +3179,131 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; FASTISEL-NEXT: kshiftrb $3, %k2, %k2 ; FASTISEL-NEXT: korb %k2, %k0, %k0 ; FASTISEL-NEXT: movb $-33, %dil -; FASTISEL-NEXT: kmovd %edi, %k5 -; FASTISEL-NEXT: kandb %k5, %k0, %k0 +; FASTISEL-NEXT: kmovd %edi, %k6 +; FASTISEL-NEXT: kandb %k6, %k0, %k0 +; FASTISEL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; FASTISEL-NEXT: kshiftlb $7, %k4, %k4 ; FASTISEL-NEXT: kshiftrb $2, %k4, %k4 ; FASTISEL-NEXT: korb %k4, %k0, %k0 ; FASTISEL-NEXT: movb $-65, %dil -; FASTISEL-NEXT: kmovd %edi, %k1 -; FASTISEL-NEXT: kandb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovq %k1, %k4 -; FASTISEL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; FASTISEL-NEXT: kshiftlb $7, %k6, %k6 -; FASTISEL-NEXT: kshiftrb $1, %k6, %k6 -; FASTISEL-NEXT: korb %k6, %k0, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; FASTISEL-NEXT: kshiftlb $7, %k6, %k6 -; FASTISEL-NEXT: kshiftrb $6, %k6, %k6 +; FASTISEL-NEXT: kmovd %edi, %k2 +; FASTISEL-NEXT: kandb %k2, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $1, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k0, %k0 +; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 +; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 +; FASTISEL-NEXT: kshiftrb $6, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 ; FASTISEL-NEXT: kshiftlb $7, %k7, %k7 ; FASTISEL-NEXT: kshiftrb $7, %k7, %k7 -; FASTISEL-NEXT: korb %k6, %k7, %k6 -; FASTISEL-NEXT: kandb %k3, %k6, %k6 +; FASTISEL-NEXT: korb %k0, %k7, %k0 +; FASTISEL-NEXT: kandb %k3, %k0, %k0 +; FASTISEL-NEXT: kmovq %k3, %k5 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 ; FASTISEL-NEXT: kshiftlb $7, %k7, %k7 ; FASTISEL-NEXT: kshiftrb $5, %k7, %k7 -; FASTISEL-NEXT: korb %k7, %k6, %k6 +; FASTISEL-NEXT: korb %k7, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k2, %k6, %k6 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k4, %k0, %k1 ; FASTISEL-NEXT: kshiftlb $7, %k7, %k7 ; FASTISEL-NEXT: kshiftrb $4, %k7, %k7 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 -; FASTISEL-NEXT: korb %k7, %k6, %k6 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k7, %k6, %k6 +; FASTISEL-NEXT: korb %k7, %k1, %k1 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k3, %k1, %k1 ; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 ; FASTISEL-NEXT: kshiftrb $3, %k0, %k0 -; FASTISEL-NEXT: korb %k0, %k6, %k0 -; FASTISEL-NEXT: kandb %k5, %k0, %k0 -; FASTISEL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; FASTISEL-NEXT: kshiftlb $7, %k6, %k6 -; FASTISEL-NEXT: kshiftrb $2, %k6, %k6 -; FASTISEL-NEXT: korb %k6, %k0, %k0 -; FASTISEL-NEXT: kandb %k4, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; FASTISEL-NEXT: kshiftlb $7, %k6, %k6 -; FASTISEL-NEXT: kshiftrb $1, %k6, %k6 -; FASTISEL-NEXT: korb %k6, %k0, %k0 -; FASTISEL-NEXT: kandb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 -; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 -; FASTISEL-NEXT: kshiftrb $6, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $7, %k1, %k1 ; FASTISEL-NEXT: korb %k0, %k1, %k0 -; FASTISEL-NEXT: kmovq %k3, %k7 -; FASTISEL-NEXT: kandb %k3, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $5, %k1, %k1 -; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k2, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $4, %k1, %k1 -; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kandb %k6, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k4, %k0, %k0 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $3, %k1, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k5, %k0, %k0 -; FASTISEL-NEXT: kshiftlb $7, %k6, %k1 ; FASTISEL-NEXT: kshiftrb $2, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k3, %k0, %k0 +; FASTISEL-NEXT: kandb %k2, %k0, %k0 +; FASTISEL-NEXT: kmovq %k2, %k6 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k1, %k0, %k0 ; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 ; FASTISEL-NEXT: kshiftrb $6, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $7, %k1, %k1 ; FASTISEL-NEXT: korb %k0, %k1, %k0 -; FASTISEL-NEXT: kandb %k7, %k0, %k0 -; FASTISEL-NEXT: kmovq %k7, %k5 +; FASTISEL-NEXT: kmovq %k5, %k2 +; FASTISEL-NEXT: kandb %k5, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $5, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k2, %k0, %k0 +; FASTISEL-NEXT: kmovq %k4, %k7 +; FASTISEL-NEXT: kandb %k4, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $4, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k4, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; FASTISEL-NEXT: kandb %k3, %k0, %k0 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $3, %k1, %k1 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k4, %k0, %k0 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k1 +; FASTISEL-NEXT: kshiftrb $2, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload ; FASTISEL-NEXT: kandb %k6, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $2, %k1, %k1 +; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kandb %k3, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 +; FASTISEL-NEXT: kshiftrb $6, %k1, %k1 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $7, %k5, %k5 +; FASTISEL-NEXT: korb %k1, %k5, %k1 +; FASTISEL-NEXT: kandb %k2, %k1, %k1 +; FASTISEL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $5, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k1, %k1 +; FASTISEL-NEXT: kandb %k7, %k1, %k1 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $4, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k1, %k1 +; FASTISEL-NEXT: kandb %k3, %k1, %k1 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $3, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k1, %k1 +; FASTISEL-NEXT: kandb %k4, %k1, %k1 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $2, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k1, %k1 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kandb %k6, %k1, %k1 +; FASTISEL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $1, %k5, %k5 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; FASTISEL-NEXT: korb %k1, %k0, %k2 +; FASTISEL-NEXT: korb %k5, %k1, %k5 ; FASTISEL-NEXT: kshiftlb $7, %k7, %k1 ; FASTISEL-NEXT: kshiftrb $6, %k1, %k1 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 @@ -3319,35 +3311,36 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; FASTISEL-NEXT: kshiftrb $7, %k7, %k7 ; FASTISEL-NEXT: korb %k1, %k7, %k1 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; FASTISEL-NEXT: kandb %k5, %k1, %k1 +; FASTISEL-NEXT: kandb %k2, %k1, %k1 ; FASTISEL-NEXT: kshiftlb $7, %k7, %k7 ; FASTISEL-NEXT: kshiftrb $5, %k7, %k7 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; FASTISEL-NEXT: korb %k7, %k1, %k1 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k7, %k1, %k1 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k3, %k1, %k1 ; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 ; FASTISEL-NEXT: kshiftrb $4, %k0, %k0 ; FASTISEL-NEXT: korb %k0, %k1, %k0 -; FASTISEL-NEXT: kandb %k4, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k2, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $3, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k6, %k0, %k0 +; FASTISEL-NEXT: kandb %k4, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $2, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k3, %k0, %k0 +; FASTISEL-NEXT: kandb %k6, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k2, %k0, %k0 +; FASTISEL-NEXT: kandb %k5, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k2, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k5, %k0, %k0 ; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; FASTISEL-NEXT: kshiftlb $7, %k1, %k0 ; FASTISEL-NEXT: kshiftrb $6, %k0, %k0 @@ -3356,69 +3349,68 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; FASTISEL-NEXT: kshiftrb $7, %k1, %k1 ; FASTISEL-NEXT: korb %k0, %k1, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kmovq %k5, %k3 -; FASTISEL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; FASTISEL-NEXT: kandb %k5, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k6, %k0, %k0 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $5, %k1, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k7, %k0, %k0 -; FASTISEL-NEXT: kshiftlb $7, %k2, %k1 +; FASTISEL-NEXT: kandb %k3, %k0, %k0 +; FASTISEL-NEXT: kshiftlb $7, %k7, %k1 ; FASTISEL-NEXT: kshiftrb $4, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovq %k4, %k5 -; FASTISEL-NEXT: kandb %k4, %k0, %k0 +; FASTISEL-NEXT: kandb %k2, %k0, %k0 +; FASTISEL-NEXT: kmovq %k2, %k7 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $3, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k6, %k0, %k0 +; FASTISEL-NEXT: kandb %k4, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $2, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k4, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k5, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 +; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 +; FASTISEL-NEXT: kshiftrb $6, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $6, %k1, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 -; FASTISEL-NEXT: kshiftrb $7, %k2, %k2 +; FASTISEL-NEXT: kshiftrb $7, %k1, %k1 +; FASTISEL-NEXT: korb %k0, %k1, %k0 +; FASTISEL-NEXT: kandb %k6, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 +; FASTISEL-NEXT: kshiftrb $5, %k1, %k1 +; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kandb %k3, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 +; FASTISEL-NEXT: kshiftrb $4, %k1, %k1 +; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; FASTISEL-NEXT: kandb %k2, %k0, %k2 +; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 +; FASTISEL-NEXT: kshiftrb $3, %k1, %k1 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; FASTISEL-NEXT: korb %k1, %k2, %k1 -; FASTISEL-NEXT: kandb %k3, %k1, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 -; FASTISEL-NEXT: kshiftrb $5, %k2, %k2 -; FASTISEL-NEXT: korb %k2, %k1, %k1 -; FASTISEL-NEXT: kandb %k7, %k1, %k1 -; FASTISEL-NEXT: kmovq %k7, %k3 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 -; FASTISEL-NEXT: kshiftrb $4, %k2, %k2 -; FASTISEL-NEXT: korb %k2, %k1, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; FASTISEL-NEXT: kandb %k5, %k1, %k1 -; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 -; FASTISEL-NEXT: kshiftrb $3, %k2, %k2 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; FASTISEL-NEXT: korb %k2, %k1, %k1 -; FASTISEL-NEXT: kandb %k6, %k1, %k1 -; FASTISEL-NEXT: kmovq %k6, %k5 -; FASTISEL-NEXT: kshiftlb $7, %k7, %k2 -; FASTISEL-NEXT: kshiftrb $2, %k2, %k2 -; FASTISEL-NEXT: korb %k2, %k1, %k1 ; FASTISEL-NEXT: kandb %k4, %k1, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 -; FASTISEL-NEXT: kshiftrb $1, %k2, %k2 -; FASTISEL-NEXT: korb %k2, %k1, %k1 -; FASTISEL-NEXT: kandb %k0, %k1, %k0 +; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 +; FASTISEL-NEXT: kshiftrb $2, %k0, %k0 +; FASTISEL-NEXT: korb %k0, %k1, %k0 +; FASTISEL-NEXT: kandb %k5, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 +; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 +; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k1, %k0, %k0 ; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 @@ -3427,8 +3419,7 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $7, %k1, %k1 ; FASTISEL-NEXT: korb %k0, %k1, %k0 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k7, %k0, %k0 +; FASTISEL-NEXT: kandb %k6, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $5, %k1, %k1 @@ -3438,19 +3429,18 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $4, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k6, %k0, %k0 +; FASTISEL-NEXT: kandb %k7, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $3, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kandb %k5, %k0, %k0 +; FASTISEL-NEXT: kandb %k4, %k0, %k0 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $2, %k1, %k1 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k4, %k0, %k0 +; FASTISEL-NEXT: kandb %k5, %k0, %k0 ; FASTISEL-NEXT: kshiftlb $7, %k2, %k1 ; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 @@ -3461,7 +3451,7 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 ; FASTISEL-NEXT: kshiftrb $7, %k2, %k2 ; FASTISEL-NEXT: korb %k1, %k2, %k1 -; FASTISEL-NEXT: kandb %k7, %k1, %k1 +; FASTISEL-NEXT: kandb %k6, %k1, %k1 ; FASTISEL-NEXT: kmovd %ecx, %k2 ; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 ; FASTISEL-NEXT: kshiftrb $5, %k2, %k2 @@ -3471,17 +3461,17 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 ; FASTISEL-NEXT: kshiftrb $4, %k2, %k2 ; FASTISEL-NEXT: korb %k2, %k1, %k1 -; FASTISEL-NEXT: kandb %k6, %k1, %k1 +; FASTISEL-NEXT: kandb %k7, %k1, %k1 ; FASTISEL-NEXT: kmovd %r9d, %k2 ; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 ; FASTISEL-NEXT: kshiftrb $3, %k2, %k2 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; FASTISEL-NEXT: korb %k2, %k1, %k1 -; FASTISEL-NEXT: kandb %k5, %k1, %k1 +; FASTISEL-NEXT: kandb %k4, %k1, %k1 ; FASTISEL-NEXT: kshiftlb $7, %k3, %k2 ; FASTISEL-NEXT: kshiftrb $2, %k2, %k2 ; FASTISEL-NEXT: korb %k2, %k1, %k1 -; FASTISEL-NEXT: kandb %k4, %k1, %k1 +; FASTISEL-NEXT: kandb %k5, %k1, %k1 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 ; FASTISEL-NEXT: kshiftrb $1, %k2, %k2 diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll index 0caa8826e75c8..7a534721bae05 100644 --- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -941,44 +941,43 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; X32-NEXT: pushl %ebx ; X32-NEXT: subl $12, %esp ; X32-NEXT: movl %esi, (%esp) # 4-byte Spill -; X32-NEXT: movl %edi, %esi -; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, %edi -; X32-NEXT: leal (%edx,%esi), %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: leal (%edx,%edi), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: subl %esi, %ebx -; X32-NEXT: movl %edi, %eax -; X32-NEXT: subl %ecx, %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: imull %eax, %ecx +; X32-NEXT: movl %edx, %eax +; X32-NEXT: subl %edi, %eax +; X32-NEXT: movl %ebp, %edx +; X32-NEXT: subl %ecx, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: subl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: imull %edx, %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %esi, %edx +; X32-NEXT: subl {{[0-9]+}}(%esp), %edx +; X32-NEXT: imull %eax, %edx +; X32-NEXT: addl %ebx, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl (%esp), %edi # 4-byte Reload +; X32-NEXT: subl %ebx, %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: subl {{[0-9]+}}(%esp), %eax -; X32-NEXT: imull %ebx, %eax -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl (%esp), %ebx # 4-byte Reload -; X32-NEXT: subl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: imull %edi, %eax +; X32-NEXT: addl %edx, %eax +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: imull %ebx, %ecx -; X32-NEXT: addl %eax, %ecx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload -; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: imull %ebp, %edi +; X32-NEXT: addl {{[0-9]+}}(%esp), %edx +; X32-NEXT: imull %edx, %ebp ; X32-NEXT: addl {{[0-9]+}}(%esp), %esi ; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: addl %esi, %edi -; X32-NEXT: addl {{[0-9]+}}(%esp), %edx -; X32-NEXT: imull %eax, %edx -; X32-NEXT: addl %edx, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: movl %edi, %eax +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: imull %ebx, %ecx +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: addl %eax, %ebp +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: addl $12, %esp ; X32-NEXT: popl %ebx ; X32-NEXT: popl %ebp @@ -986,6 +985,7 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; ; WIN64-LABEL: testi32_inp: ; WIN64: # %bb.0: +; WIN64-NEXT: pushq %rbp ; WIN64-NEXT: pushq %rbx ; WIN64-NEXT: # kill: def $edx killed $edx def $rdx ; WIN64-NEXT: # kill: def $esi killed $esi def $rsi @@ -998,35 +998,36 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8 ; WIN64-NEXT: # kill: def $edi killed $edi def $rdi ; WIN64-NEXT: leal (%rdx,%rdi), %ebx -; WIN64-NEXT: # kill: def $edx killed $edx killed $rdx -; WIN64-NEXT: subl %edi, %edx -; WIN64-NEXT: leal (%rsi,%r8), %edi +; WIN64-NEXT: movl %edx, %ebp +; WIN64-NEXT: subl %edi, %ebp +; WIN64-NEXT: leal (%rsi,%r8), %edx ; WIN64-NEXT: # kill: def $esi killed $esi killed $rsi ; WIN64-NEXT: subl %r8d, %esi -; WIN64-NEXT: leal (%r9,%r10), %r8d -; WIN64-NEXT: # kill: def $r9d killed $r9d killed $r9 -; WIN64-NEXT: subl %r10d, %r9d -; WIN64-NEXT: movl %eax, %r10d -; WIN64-NEXT: subl %ecx, %r10d -; WIN64-NEXT: imull %r10d, %r9d -; WIN64-NEXT: leal (%r11,%r12), %r10d -; WIN64-NEXT: # kill: def $r11d killed $r11d killed $r11 -; WIN64-NEXT: subl %r12d, %r11d -; WIN64-NEXT: imull %edx, %r11d -; WIN64-NEXT: addl %r9d, %r11d -; WIN64-NEXT: leal (%r14,%r15), %edx -; WIN64-NEXT: movl %r14d, %r9d -; WIN64-NEXT: subl %r15d, %r9d -; WIN64-NEXT: imull %esi, %r9d -; WIN64-NEXT: addl %r11d, %r9d +; WIN64-NEXT: leal (%r9,%r10), %edi +; WIN64-NEXT: movl %r9d, %r8d +; WIN64-NEXT: subl %r10d, %r8d +; WIN64-NEXT: movl %eax, %r9d +; WIN64-NEXT: subl %ecx, %r9d +; WIN64-NEXT: imull %r9d, %r8d +; WIN64-NEXT: leal (%r11,%r12), %r9d +; WIN64-NEXT: movl %r11d, %r10d +; WIN64-NEXT: subl %r12d, %r10d +; WIN64-NEXT: imull %ebp, %r10d +; WIN64-NEXT: addl %r8d, %r10d +; WIN64-NEXT: leal (%r14,%r15), %r8d +; WIN64-NEXT: movl %r14d, %r11d +; WIN64-NEXT: subl %r15d, %r11d +; WIN64-NEXT: imull %esi, %r11d +; WIN64-NEXT: addl %r10d, %r11d ; WIN64-NEXT: addl %ecx, %eax -; WIN64-NEXT: imull %r8d, %eax -; WIN64-NEXT: imull %ebx, %r10d -; WIN64-NEXT: addl %r10d, %eax -; WIN64-NEXT: imull %edi, %edx -; WIN64-NEXT: addl %edx, %eax +; WIN64-NEXT: imull %edi, %eax +; WIN64-NEXT: imull %ebx, %r9d ; WIN64-NEXT: addl %r9d, %eax +; WIN64-NEXT: imull %edx, %r8d +; WIN64-NEXT: addl %r8d, %eax +; WIN64-NEXT: addl %r11d, %eax ; WIN64-NEXT: popq %rbx +; WIN64-NEXT: popq %rbp ; WIN64-NEXT: retq ; ; LINUXOSX64-LABEL: testi32_inp: @@ -1040,35 +1041,35 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; LINUXOSX64-NEXT: # kill: def $r8d killed $r8d def $r8 ; LINUXOSX64-NEXT: # kill: def $edi killed $edi def $rdi ; LINUXOSX64-NEXT: leal (%rdx,%rdi), %r10d -; LINUXOSX64-NEXT: # kill: def $edx killed $edx killed $rdx -; LINUXOSX64-NEXT: subl %edi, %edx -; LINUXOSX64-NEXT: leal (%rsi,%r8), %edi +; LINUXOSX64-NEXT: movl %edx, %r11d +; LINUXOSX64-NEXT: subl %edi, %r11d +; LINUXOSX64-NEXT: leal (%rsi,%r8), %edx ; LINUXOSX64-NEXT: # kill: def $esi killed $esi killed $rsi ; LINUXOSX64-NEXT: subl %r8d, %esi -; LINUXOSX64-NEXT: leal (%r9,%r12), %r8d -; LINUXOSX64-NEXT: # kill: def $r9d killed $r9d killed $r9 -; LINUXOSX64-NEXT: subl %r12d, %r9d -; LINUXOSX64-NEXT: movl %eax, %r11d -; LINUXOSX64-NEXT: subl %ecx, %r11d -; LINUXOSX64-NEXT: imull %r11d, %r9d -; LINUXOSX64-NEXT: leal (%r13,%r14), %r11d +; LINUXOSX64-NEXT: leal (%r9,%r12), %edi +; LINUXOSX64-NEXT: movl %r9d, %r8d +; LINUXOSX64-NEXT: subl %r12d, %r8d +; LINUXOSX64-NEXT: movl %eax, %r9d +; LINUXOSX64-NEXT: subl %ecx, %r9d +; LINUXOSX64-NEXT: imull %r9d, %r8d +; LINUXOSX64-NEXT: leal (%r13,%r14), %r9d ; LINUXOSX64-NEXT: movl %r13d, %r12d ; LINUXOSX64-NEXT: subl %r14d, %r12d -; LINUXOSX64-NEXT: imull %edx, %r12d -; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %edx -; LINUXOSX64-NEXT: addl %r9d, %r12d -; LINUXOSX64-NEXT: movl %r15d, %r9d -; LINUXOSX64-NEXT: subl %edx, %r9d -; LINUXOSX64-NEXT: imull %esi, %r9d -; LINUXOSX64-NEXT: addl %r12d, %r9d +; LINUXOSX64-NEXT: imull %r11d, %r12d +; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; LINUXOSX64-NEXT: addl %r8d, %r12d +; LINUXOSX64-NEXT: movl %r15d, %r8d +; LINUXOSX64-NEXT: subl %r11d, %r8d +; LINUXOSX64-NEXT: imull %esi, %r8d +; LINUXOSX64-NEXT: addl %r12d, %r8d ; LINUXOSX64-NEXT: addl %ecx, %eax -; LINUXOSX64-NEXT: imull %r8d, %eax -; LINUXOSX64-NEXT: imull %r10d, %r11d -; LINUXOSX64-NEXT: addl %r11d, %eax -; LINUXOSX64-NEXT: addl %r15d, %edx -; LINUXOSX64-NEXT: imull %edi, %edx -; LINUXOSX64-NEXT: addl %edx, %eax +; LINUXOSX64-NEXT: imull %edi, %eax +; LINUXOSX64-NEXT: imull %r10d, %r9d ; LINUXOSX64-NEXT: addl %r9d, %eax +; LINUXOSX64-NEXT: addl %r15d, %r11d +; LINUXOSX64-NEXT: imull %edx, %r11d +; LINUXOSX64-NEXT: addl %r11d, %eax +; LINUXOSX64-NEXT: addl %r8d, %eax ; LINUXOSX64-NEXT: retq %x1 = sub i32 %a1, %a2 %x2 = sub i32 %a3, %a4 diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll index 9cb3ceae16f09..51ffeca52a665 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -1941,45 +1941,47 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind { define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwind { ; X86-LABEL: test_mask_cmp_b_512: ; X86: # %bb.0: +; X86-NEXT: pushl %edi # encoding: [0x57] ; X86-NEXT: pushl %esi # encoding: [0x56] -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] ; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X86-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x64,0xc0] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] +; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] ; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] +; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] ; X86-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x02] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2] -; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] +; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] ; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] +; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] +; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] ; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] -; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] +; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] ; X86-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] ; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] +; X86-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] +; X86-NEXT: addl %esi, %edi # encoding: [0x01,0xf7] ; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] ; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] +; X86-NEXT: addl %edi, %eax # encoding: [0x01,0xf8] ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] -; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x08] -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x0c] +; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x0c] +; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x10] ; X86-NEXT: popl %esi # encoding: [0x5e] +; X86-NEXT: popl %edi # encoding: [0x5f] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2114,45 +2116,47 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind { define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwind { ; X86-LABEL: test_mask_x86_avx512_ucmp_b_512: ; X86: # %bb.0: +; X86-NEXT: pushl %edi # encoding: [0x57] ; X86-NEXT: pushl %esi # encoding: [0x56] -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] ; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X86-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x01] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] +; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] ; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] +; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] ; X86-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x02] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2] -; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] +; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] ; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] +; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] +; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] ; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] -; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] +; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] ; X86-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] ; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] +; X86-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] +; X86-NEXT: addl %esi, %edi # encoding: [0x01,0xf7] ; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] ; X86-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x06] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] +; X86-NEXT: addl %edi, %eax # encoding: [0x01,0xf8] ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] -; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x08] -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x0c] +; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x0c] +; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x10] ; X86-NEXT: popl %esi # encoding: [0x5e] +; X86-NEXT: popl %edi # encoding: [0x5f] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll index bcae88259a92e..9daac1df1d975 100644 --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -710,7 +710,7 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $56, %esp +; X86-NEXT: subl $60, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -733,8 +733,7 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: andl $1431633920, %ebp # imm = 0x55550000 ; X86-NEXT: shrl %ebx ; X86-NEXT: andl $1431633920, %ebx # imm = 0x55550000 -; X86-NEXT: leal (%ebx,%ebp,2), %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: leal (%ebx,%ebp,2), %ebp ; X86-NEXT: bswapl %edi ; X86-NEXT: movl %edi, %ebx ; X86-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F @@ -751,7 +750,8 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: andl $1431655765, %ebx # imm = 0x55555555 ; X86-NEXT: shrl %edi ; X86-NEXT: andl $1431655765, %edi # imm = 0x55555555 -; X86-NEXT: leal (%edi,%ebx,2), %ebx +; X86-NEXT: leal (%edi,%ebx,2), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: bswapl %esi ; X86-NEXT: movl %esi, %edi ; X86-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F @@ -768,8 +768,7 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: andl $1431655765, %edi # imm = 0x55555555 ; X86-NEXT: shrl %esi ; X86-NEXT: andl $1431655765, %esi # imm = 0x55555555 -; X86-NEXT: leal (%esi,%edi,2), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal (%esi,%edi,2), %ebx ; X86-NEXT: bswapl %edx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F @@ -898,8 +897,7 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: leal (%eax,%ecx,2), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal (%eax,%ecx,2), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bswapl %eax ; X86-NEXT: movl %eax, %ecx @@ -937,7 +935,7 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bswapl %eax ; X86-NEXT: movl %eax, %ecx @@ -1012,7 +1010,8 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: leal (%eax,%ecx,2), %edi +; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bswapl %eax ; X86-NEXT: movl %eax, %ecx @@ -1031,16 +1030,11 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %edx -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: shrdl $16, %ebx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shrdl $16, %eax, %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: movl %ebp, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shrdl $16, %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shrdl $16, %eax, %ecx +; X86-NEXT: shrdl $16, %ecx, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: shrdl $16, %ebx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: shrdl $16, %ecx, %eax @@ -1060,25 +1054,30 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shrdl $16, %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shrdl $16, %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shrdl $16, %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NEXT: shrdl $16, %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: shrdl $16, %ebp, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: shrdl $16, %ebx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shrdl $16, %eax, %ebx -; X86-NEXT: shrdl $16, %edi, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shrdl $16, %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shrdl $16, %edi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shrdl $16, %ecx, %edi +; X86-NEXT: shrdl $16, %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 60(%eax) -; X86-NEXT: movl %ecx, 56(%eax) +; X86-NEXT: movl %ecx, 60(%eax) +; X86-NEXT: movl %edi, 56(%eax) ; X86-NEXT: movl %ebx, 52(%eax) ; X86-NEXT: movl %ebp, 48(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, 44(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, 40(%eax) @@ -1098,12 +1097,12 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: shrl $16, %edx ; X86-NEXT: movw %dx, 64(%eax) -; X86-NEXT: addl $56, %esp +; X86-NEXT: addl $60, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/combine-rotates.ll b/llvm/test/CodeGen/X86/combine-rotates.ll index dc8c0e13edcaa..8e43ae438f2ae 100644 --- a/llvm/test/CodeGen/X86/combine-rotates.ll +++ b/llvm/test/CodeGen/X86/combine-rotates.ll @@ -142,16 +142,17 @@ define <4 x i32> @combine_vec_rot_select_zero(<4 x i32>, <4 x i32>) { ; ; SSE2-LIS-LABEL: combine_vec_rot_select_zero: ; SSE2-LIS: # %bb.0: -; SSE2-LIS-NEXT: pxor %xmm2, %xmm2 -; SSE2-LIS-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-LIS-NEXT: movdqa %xmm0, %xmm2 +; SSE2-LIS-NEXT: pxor %xmm0, %xmm0 +; SSE2-LIS-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-LIS-NEXT: pslld $23, %xmm1 ; SSE2-LIS-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-LIS-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-LIS-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE2-LIS-NEXT: movdqa %xmm0, %xmm3 +; SSE2-LIS-NEXT: movdqa %xmm2, %xmm3 ; SSE2-LIS-NEXT: pmuludq %xmm1, %xmm3 ; SSE2-LIS-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] -; SSE2-LIS-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE2-LIS-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] ; SSE2-LIS-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-LIS-NEXT: pmuludq %xmm5, %xmm1 ; SSE2-LIS-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3] @@ -160,10 +161,9 @@ define <4 x i32> @combine_vec_rot_select_zero(<4 x i32>, <4 x i32>) { ; SSE2-LIS-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-LIS-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE2-LIS-NEXT: por %xmm4, %xmm3 -; SSE2-LIS-NEXT: pand %xmm2, %xmm0 -; SSE2-LIS-NEXT: pandn %xmm3, %xmm2 -; SSE2-LIS-NEXT: por %xmm0, %xmm2 -; SSE2-LIS-NEXT: movdqa %xmm2, %xmm0 +; SSE2-LIS-NEXT: pand %xmm0, %xmm2 +; SSE2-LIS-NEXT: pandn %xmm3, %xmm0 +; SSE2-LIS-NEXT: por %xmm2, %xmm0 ; SSE2-LIS-NEXT: retq ; ; XOP-LABEL: combine_vec_rot_select_zero: diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll index 222a4d78668b6..3efd536adc4d1 100644 --- a/llvm/test/CodeGen/X86/dagcombine-cse.ll +++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll @@ -51,26 +51,26 @@ define i96 @square_high(i96 %x) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %edi -; X86-NEXT: addl %eax, %ebx +; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl %eax, %ebx +; X86-NEXT: addl %eax, %ecx ; X86-NEXT: adcl %edx, %ebp ; X86-NEXT: setb %al ; X86-NEXT: movzbl %al, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll index 7ce11ad6abaf3..d26f4b7044cf3 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -177,104 +177,103 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $156, %esp +; X86-NEXT: subl $152, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: xorl %ecx, %esi -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %esi, %edi ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %esi -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %esi -; X86-NEXT: xorl %ebp, %esi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edi, %ebx -; X86-NEXT: sbbl %edi, %ebp -; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: sbbl %edi, %esi -; X86-NEXT: xorl %eax, %edi +; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: orl %edx, %ecx ; X86-NEXT: movl %edx, %edi +; X86-NEXT: xorl %ebp, %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: subl %edx, %ebp +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: orl %edi, %eax +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: orl %ebx, %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %cl -; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl (%esp), %edx # 4-byte Folded Reload ; X86-NEXT: orl %eax, %edx ; X86-NEXT: sete %al ; X86-NEXT: orb %cl, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: bsrl %esi, %edx +; X86-NEXT: bsrl %edi, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: bsrl %edi, %ecx +; X86-NEXT: bsrl %ebx, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx -; X86-NEXT: testl %esi, %esi +; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: bsrl %ebp, %edx +; X86-NEXT: bsrl %esi, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: bsrl %ebx, %ebx -; X86-NEXT: xorl $31, %ebx -; X86-NEXT: addl $32, %ebx ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testl %ebp, %ebp -; X86-NEXT: cmovnel %edx, %ebx -; X86-NEXT: addl $64, %ebx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: bsrl %ebp, %ebp +; X86-NEXT: xorl $31, %ebp +; X86-NEXT: addl $32, %ebp ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %edi -; X86-NEXT: cmovnel %ecx, %ebx +; X86-NEXT: testl %esi, %esi +; X86-NEXT: cmovnel %edx, %ebp +; X86-NEXT: addl $64, %ebp +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: cmovnel %ecx, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: bsrl %edi, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: bsrl %ebp, %ecx +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: bsrl %eax, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx ; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: bsrl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: bsrl %ebx, %esi ; X86-NEXT: xorl $31, %esi ; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: xorl $31, %edx ; X86-NEXT: addl $32, %edx -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %esi, %edx ; X86-NEXT: addl $64, %edx -; X86-NEXT: movl %ebp, %esi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: orl %edi, %esi ; X86-NEXT: cmovnel %ecx, %edx -; X86-NEXT: xorl %ebp, %ebp -; X86-NEXT: subl %edx, %ebx +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: subl %edx, %ebp ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax ; X86-NEXT: movl $0, %edx @@ -282,8 +281,9 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl $0, %esi ; X86-NEXT: sbbl %esi, %esi ; X86-NEXT: movl $127, %ecx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %ebx, %ecx +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %ebp, %ecx +; X86-NEXT: movl %esi, %ebp ; X86-NEXT: movl $0, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %ecx @@ -291,36 +291,35 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl $0, %ecx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: setb %cl ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: cmovnel %ebp, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: cmovnel %ebp, %edi -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: cmovnel %ebp, %esi -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmovnel %ebx, %edi +; X86-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NEXT: cmovnel %ebx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovnel %ebx, %eax +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, %esi ; X86-NEXT: jne .LBB4_1 ; X86-NEXT: # %bb.8: # %_udiv-special-cases -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: xorl $127, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: xorl $127, %ebp +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: je .LBB4_9 ; X86-NEXT: # %bb.5: # %udiv-bb1 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -329,89 +328,93 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %ecx, %ebp ; X86-NEXT: xorb $127, %al ; X86-NEXT: movb %al, %ch ; X86-NEXT: andb $7, %ch ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $15, %al ; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 148(%esp,%eax), %edx -; X86-NEXT: movl 152(%esp,%eax), %ebx +; X86-NEXT: movsbl %al, %edi +; X86-NEXT: movl 144(%esp,%edi), %edx +; X86-NEXT: movl 148(%esp,%edi), %esi ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shldl %cl, %edx, %ebx +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shll %cl, %edx ; X86-NEXT: notb %cl -; X86-NEXT: movl 144(%esp,%eax), %ebp -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: shrl %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl 140(%esp,%eax), %eax +; X86-NEXT: movl 140(%esp,%edi), %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shrl %esi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl 136(%esp,%edi), %esi ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shldl %cl, %eax, %ebp -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl $1, %esi +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: shll %cl, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $0, %eax +; X86-NEXT: addl $1, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl $0, %esi ; X86-NEXT: jae .LBB4_2 ; X86-NEXT: # %bb.6: +; X86-NEXT: xorl %ebp, %ebp ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: xorl %eax, %eax ; X86-NEXT: jmp .LBB4_7 ; X86-NEXT: .LBB4_1: -; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: jmp .LBB4_9 ; X86-NEXT: .LBB4_2: # %udiv-preheader +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movb %al, %ch +; X86-NEXT: movb %bl, %ch ; X86-NEXT: andb $7, %ch -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $15, %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 104(%esp,%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 100(%esp,%eax), %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: movb %bl, %cl +; X86-NEXT: shrb $3, %cl +; X86-NEXT: andb $15, %cl +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl %cl, %ebx +; X86-NEXT: movl 100(%esp,%ebx), %esi +; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: movl 96(%esp,%ebx), %edi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %ebp ; X86-NEXT: movb %ch, %cl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shrdl %cl, %esi, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl 92(%esp,%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 96(%esp,%eax), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: shrdl %cl, %esi, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl 88(%esp,%ebx), %esi +; X86-NEXT: movl 92(%esp,%ebx), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: notb %cl -; X86-NEXT: addl %ebx, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: orl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edi, %edi +; X86-NEXT: shll %cl, %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill +; X86-NEXT: shrdl %cl, %ebx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -421,179 +424,177 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $-1, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB4_3: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl $1, %esi, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $1, %edx, %esi +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: shldl $1, %ebp, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: shldl $1, %ebp, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ebx, %ebp +; X86-NEXT: shldl $1, %esi, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $1, %ecx, %edx +; X86-NEXT: shldl $1, %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: shldl $1, %edi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $1, %ebp, %edi -; X86-NEXT: orl %ebx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $1, %ecx, %ebp -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $1, %ecx, %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: sbbl %ebp, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: sbbl %ebp, %ecx +; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload ; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: andl $1, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl %ecx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %edi, %edx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: sbbl %edi, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: sbbl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: sbbl %eax, (%esp) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl $-1, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $-1, %edi ; X86-NEXT: adcl $-1, %edx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %edx, %eax ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: jne .LBB4_3 ; X86-NEXT: # %bb.4: +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: .LBB4_7: # %udiv-loop-exit -; X86-NEXT: shldl $1, %edi, %ebx -; X86-NEXT: orl %eax, %ebx -; X86-NEXT: shldl $1, %ebp, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $1, %edx, %ebp -; X86-NEXT: orl %eax, %ebp -; X86-NEXT: addl %edx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: shldl $1, %eax, %edx ; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: .LBB4_9: # %udiv-end -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: xorl %eax, %ebx -; X86-NEXT: xorl %eax, %edi -; X86-NEXT: xorl %eax, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: xorl %eax, %esi -; X86-NEXT: subl %eax, %esi +; X86-NEXT: shldl $1, %esi, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: addl %esi, %esi +; X86-NEXT: orl %ebp, %esi +; X86-NEXT: .LBB4_9: # %udiv-end +; X86-NEXT: xorl %ebx, %edi +; X86-NEXT: xorl %ebx, %edx +; X86-NEXT: xorl %ebx, %eax +; X86-NEXT: xorl %ebx, %esi +; X86-NEXT: subl %ebx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, (%edx) -; X86-NEXT: movl %ebp, 4(%edx) -; X86-NEXT: movl %edi, 8(%edx) -; X86-NEXT: movl %ebx, 12(%edx) -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %ebx, %edx +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %esi, (%ecx) +; X86-NEXT: movl %eax, 4(%ecx) +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl %edi, 12(%ecx) ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl (%esp), %ebx # 4-byte Folded Reload -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: setb %bl -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %esi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: imull %eax, %ebx -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: imull %eax, %ecx +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: imull %esi, %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: addl %ecx, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: imull %edi, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: imull %esi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: imull %edx, %ebx +; X86-NEXT: imull %edx, %ebp ; X86-NEXT: mull %edx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: addl %esi, %ebx +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: adcl %ebx, %ebp ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: subl (%esp), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: sbbl %ebp, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: movl %esi, 8(%eax) ; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: addl $156, %esp +; X86-NEXT: addl $152, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -607,24 +608,24 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %r8, %rbx -; X64-NEXT: movq %rcx, %r14 -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rsi, %r12 -; X64-NEXT: movq %rdi, %r13 +; X64-NEXT: movq %r8, %r15 +; X64-NEXT: movq %rcx, %r12 +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: movq %rdi, %r14 ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rdx, 8(%rbx) -; X64-NEXT: movq %rax, (%rbx) -; X64-NEXT: imulq %rax, %r14 -; X64-NEXT: mulq %r15 -; X64-NEXT: addq %r14, %rdx -; X64-NEXT: imulq %r15, %rcx +; X64-NEXT: movq %rdx, 8(%r15) +; X64-NEXT: movq %rax, (%r15) +; X64-NEXT: imulq %rax, %r12 +; X64-NEXT: mulq %r13 +; X64-NEXT: addq %r12, %rdx +; X64-NEXT: imulq %r13, %rcx ; X64-NEXT: addq %rdx, %rcx -; X64-NEXT: subq %rax, %r13 -; X64-NEXT: sbbq %rcx, %r12 -; X64-NEXT: movq %r13, %rax -; X64-NEXT: movq %r12, %rdx +; X64-NEXT: subq %rax, %r14 +; X64-NEXT: sbbq %rcx, %rbx +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rbx, %rdx ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 @@ -713,23 +714,23 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi ; X86-NEXT: movsbl (%esp), %eax ; X86-NEXT: idivb {{[0-9]+}}(%esp) ; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X86-NEXT: movd %edx, %xmm4 +; X86-NEXT: movd %edx, %xmm7 ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X86-NEXT: movd %esi, %xmm7 +; X86-NEXT: movd %esi, %xmm4 ; X86-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; X86-NEXT: movd %edi, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; X86-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; X86-NEXT: movd %ebx, %xmm4 +; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; X86-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; X86-NEXT: movd %ebx, %xmm5 ; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movd %ecx, %xmm5 +; X86-NEXT: movd %ecx, %xmm6 ; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; X86-NEXT: movdqa %xmm2, %xmm4 ; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; X86-NEXT: movdqa %xmm4, (%ecx) @@ -816,47 +817,47 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi ; X64-NEXT: movd %r8d, %xmm5 ; X64-NEXT: movd %r9d, %xmm6 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X64-NEXT: movd %r10d, %xmm2 +; X64-NEXT: movd %r10d, %xmm7 ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; X64-NEXT: movd %r11d, %xmm4 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; X64-NEXT: movd %ebx, %xmm3 -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; X64-NEXT: movd %ebp, %xmm6 -; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-NEXT: movd %ebx, %xmm2 +; X64-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; X64-NEXT: movd %ebp, %xmm3 +; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; X64-NEXT: movd %r14d, %xmm4 -; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X64-NEXT: movd %r15d, %xmm2 -; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; X64-NEXT: movd %r15d, %xmm6 +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; X64-NEXT: movd %r12d, %xmm5 -; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; X64-NEXT: movd %r13d, %xmm6 -; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; X64-NEXT: movd %edx, %xmm2 +; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; X64-NEXT: movd %r13d, %xmm3 +; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; X64-NEXT: movd %edx, %xmm6 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; X64-NEXT: movzbl %cl, %ecx ; X64-NEXT: movd %ecx, %xmm4 -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; X64-NEXT: movzbl %al, %eax -; X64-NEXT: movd %eax, %xmm6 -; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; X64-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; X64-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; X64-NEXT: movdqa %xmm6, %xmm2 -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X64-NEXT: movd %eax, %xmm3 +; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; X64-NEXT: movdqa %xmm3, %xmm4 +; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movdqa %xmm2, (%rax) +; X64-NEXT: movdqa %xmm4, (%rax) +; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-NEXT: pmullw %xmm2, %xmm4 +; X64-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; X64-NEXT: pand %xmm2, %xmm4 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; X64-NEXT: pmullw %xmm3, %xmm2 -; X64-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; X64-NEXT: pand %xmm3, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: pmullw %xmm6, %xmm1 -; X64-NEXT: pand %xmm3, %xmm1 -; X64-NEXT: packuswb %xmm2, %xmm1 +; X64-NEXT: pmullw %xmm3, %xmm1 +; X64-NEXT: pand %xmm2, %xmm1 +; X64-NEXT: packuswb %xmm4, %xmm1 ; X64-NEXT: psubb %xmm1, %xmm0 ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll index 454f8b7242dc2..ebb95f16a723c 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -178,13 +178,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $132, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: orl %edi, %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sete %bl @@ -197,29 +197,31 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movb %al, (%esp) # 1-byte Spill ; X86-NEXT: bsrl %esi, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: bsrl %ebp, %ecx +; X86-NEXT: bsrl %edi, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx ; X86-NEXT: testl %esi, %esi ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: bsrl %edi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: bsrl %eax, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: bsrl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: addl $32, %eax -; X86-NEXT: testl %edi, %edi -; X86-NEXT: cmovnel %edx, %eax -; X86-NEXT: addl $64, %eax -; X86-NEXT: orl %esi, %ebp -; X86-NEXT: cmovnel %ecx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: bsrl %ebp, %edx +; X86-NEXT: bsrl %ebp, %ebp +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: xorl $31, %ebp +; X86-NEXT: addl $32, %ebp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: cmovnel %edx, %ebp +; X86-NEXT: addl $64, %ebp +; X86-NEXT: orl %ebx, %edi +; X86-NEXT: cmovnel %ecx, %ebp +; X86-NEXT: bsrl %esi, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: bsrl %ebx, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx -; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: testl %esi, %esi ; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: bsrl %edi, %esi @@ -228,67 +230,69 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: xorl $31, %edx ; X86-NEXT: addl $32, %edx ; X86-NEXT: testl %edi, %edi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: cmovnel %esi, %edx ; X86-NEXT: addl $64, %edx ; X86-NEXT: movl %ebx, %esi -; X86-NEXT: orl %ebp, %esi +; X86-NEXT: orl {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmovnel %ecx, %edx ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: subl %edx, %eax +; X86-NEXT: subl %edx, %ebp ; X86-NEXT: movl $0, %esi ; X86-NEXT: sbbl %esi, %esi -; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %ebx, %ebx ; X86-NEXT: movl $0, %edi ; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ebx, %ebx ; X86-NEXT: movl $127, %edx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %eax, %edx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %ebp, %edx ; X86-NEXT: movl $0, %edx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %esi, %edx ; X86-NEXT: movl $0, %edx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: movl $0, %edx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %edi, %edx +; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %ebx, %edx ; X86-NEXT: setb %dl ; X86-NEXT: orb (%esp), %dl # 1-byte Folded Reload -; X86-NEXT: cmovnel %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovnel %ecx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmovnel %ecx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovnel %ecx, %esi +; X86-NEXT: cmovnel %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmovnel %ecx, %ebp ; X86-NEXT: jne .LBB4_8 ; X86-NEXT: # %bb.1: # %_udiv-special-cases -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: xorl $127, %eax ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: orl %ebx, %ecx ; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %edi, %ebp ; X86-NEXT: je .LBB4_8 ; X86-NEXT: # %bb.2: # %udiv-bb1 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: xorb $127, %al ; X86-NEXT: movb %al, %ch ; X86-NEXT: andb $7, %ch @@ -297,69 +301,69 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: negb %al ; X86-NEXT: movsbl %al, %eax ; X86-NEXT: movl 124(%esp,%eax), %edx -; X86-NEXT: movl 128(%esp,%eax), %edi +; X86-NEXT: movl 128(%esp,%eax), %esi ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shldl %cl, %edx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-NEXT: shll %cl, %edx ; X86-NEXT: notb %cl -; X86-NEXT: movl 120(%esp,%eax), %ebx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shrl %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl 116(%esp,%eax), %edx +; X86-NEXT: movl 120(%esp,%eax), %ebp +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: shrl %esi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: movl 116(%esp,%eax), %ebp ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shldl %cl, %edx, %ebx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: addl $1, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: shldl %cl, %ebp, %edx +; X86-NEXT: shll %cl, %ebp +; X86-NEXT: addl $1, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $0, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: jae .LBB4_3 ; X86-NEXT: # %bb.6: ; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: jmp .LBB4_7 ; X86-NEXT: .LBB4_3: # %udiv-preheader -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb %al, %ch ; X86-NEXT: andb $7, %ch +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $15, %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 80(%esp,%eax), %esi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%eax), %ebp +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 76(%esp,%eax), %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl %edi, %ebx ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrdl %cl, %esi, %edx -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%esp,%eax), %edx -; X86-NEXT: movl 72(%esp,%eax), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: shrdl %cl, %ebp, %ebx +; X86-NEXT: movl 68(%esp,%eax), %esi +; X86-NEXT: movl 72(%esp,%eax), %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: notb %cl ; X86-NEXT: addl %edi, %edi @@ -367,59 +371,56 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: orl %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrl %cl, %esi -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-NEXT: shrdl %cl, %ebp, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shrl %cl, %ebp +; X86-NEXT: shrdl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: addl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: adcl $-1, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl (%esp), %esi # 4-byte Reload ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB4_4: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: shldl $1, %ebx, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: shldl $1, %ebx, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl $1, %ebx, %edx +; X86-NEXT: shldl $1, %ebx, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %ebx +; X86-NEXT: shldl $1, %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl $1, %edi, %ebx -; X86-NEXT: shldl $1, %ebp, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl $1, %esi, %ebp -; X86-NEXT: orl %eax, %ebp +; X86-NEXT: orl %edi, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $1, %ecx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: orl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shldl $1, %eax, %ecx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %edi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %eax, %eax ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ebp, %ecx ; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl $1, %eax @@ -427,122 +428,123 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: andl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl %ecx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: sbbl %eax, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx -; X86-NEXT: adcl $-1, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: adcl $-1, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl $-1, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $-1, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %eax ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl (%esp), %ebp # 4-byte Reload ; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: jne .LBB4_4 ; X86-NEXT: # %bb.5: -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: .LBB4_7: # %udiv-loop-exit -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $1, %edi, %ebp -; X86-NEXT: orl %eax, %ebp -; X86-NEXT: shldl $1, %ebx, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: shldl $1, %edx, %ebx +; X86-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %esi, %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: shldl $1, %ebx, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: shldl $1, %ebp, %ebx ; X86-NEXT: orl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %edx, %edx -; X86-NEXT: orl %ecx, %edx +; X86-NEXT: addl %ebp, %ebp +; X86-NEXT: orl %ecx, %ebp +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: .LBB4_8: # %udiv-end -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %ebp, 12(%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %eax, 4(%ecx) +; X86-NEXT: movl %esi, 8(%ecx) +; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: imull %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: mull %edx +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: imull %ecx, %ebx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: imull %esi, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull %edi, %ecx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: imull {{[0-9]+}}(%esp), %ebp ; X86-NEXT: addl %edx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %eax, %edi -; X86-NEXT: addl %ebp, %edi -; X86-NEXT: addl (%esp), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: imull %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: movl %edi, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %esi, %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: adcl %edi, %ebp ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: subl (%esp), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ebx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: addl $132, %esp ; X86-NEXT: popl %esi @@ -558,24 +560,24 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %r8, %rbx -; X64-NEXT: movq %rcx, %r14 -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rsi, %r12 -; X64-NEXT: movq %rdi, %r13 +; X64-NEXT: movq %r8, %r15 +; X64-NEXT: movq %rcx, %r12 +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: movq %rdi, %r14 ; X64-NEXT: callq __udivti3@PLT ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rdx, 8(%rbx) -; X64-NEXT: movq %rax, (%rbx) -; X64-NEXT: imulq %rax, %r14 -; X64-NEXT: mulq %r15 -; X64-NEXT: addq %r14, %rdx -; X64-NEXT: imulq %r15, %rcx +; X64-NEXT: movq %rdx, 8(%r15) +; X64-NEXT: movq %rax, (%r15) +; X64-NEXT: imulq %rax, %r12 +; X64-NEXT: mulq %r13 +; X64-NEXT: addq %r12, %rdx +; X64-NEXT: imulq %r13, %rcx ; X64-NEXT: addq %rdx, %rcx -; X64-NEXT: subq %rax, %r13 -; X64-NEXT: sbbq %rcx, %r12 -; X64-NEXT: movq %r13, %rax -; X64-NEXT: movq %r12, %rdx +; X64-NEXT: subq %rax, %r14 +; X64-NEXT: sbbq %rcx, %rbx +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rbx, %rdx ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 @@ -664,23 +666,23 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi ; X86-NEXT: movzbl (%esp), %eax ; X86-NEXT: divb {{[0-9]+}}(%esp) ; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X86-NEXT: movd %edx, %xmm4 +; X86-NEXT: movd %edx, %xmm7 ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X86-NEXT: movd %esi, %xmm7 +; X86-NEXT: movd %esi, %xmm4 ; X86-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; X86-NEXT: movd %edi, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; X86-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; X86-NEXT: movd %ebx, %xmm4 +; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; X86-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; X86-NEXT: movd %ebx, %xmm5 ; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movd %ecx, %xmm5 +; X86-NEXT: movd %ecx, %xmm6 ; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; X86-NEXT: movdqa %xmm2, %xmm4 ; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; X86-NEXT: movdqa %xmm4, (%ecx) @@ -767,47 +769,47 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi ; X64-NEXT: movd %r8d, %xmm5 ; X64-NEXT: movd %r9d, %xmm6 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X64-NEXT: movd %r10d, %xmm2 +; X64-NEXT: movd %r10d, %xmm7 ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; X64-NEXT: movd %r11d, %xmm4 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; X64-NEXT: movd %ebx, %xmm3 -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; X64-NEXT: movd %ebp, %xmm6 -; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-NEXT: movd %ebx, %xmm2 +; X64-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; X64-NEXT: movd %ebp, %xmm3 +; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; X64-NEXT: movd %r14d, %xmm4 -; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X64-NEXT: movd %r15d, %xmm2 -; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; X64-NEXT: movd %r15d, %xmm6 +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; X64-NEXT: movd %r12d, %xmm5 -; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; X64-NEXT: movd %r13d, %xmm6 -; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; X64-NEXT: movd %edx, %xmm2 +; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; X64-NEXT: movd %r13d, %xmm3 +; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; X64-NEXT: movd %edx, %xmm6 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; X64-NEXT: movzbl %cl, %ecx ; X64-NEXT: movd %ecx, %xmm4 -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; X64-NEXT: movzbl %al, %eax -; X64-NEXT: movd %eax, %xmm6 -; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; X64-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; X64-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; X64-NEXT: movdqa %xmm6, %xmm2 -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X64-NEXT: movd %eax, %xmm3 +; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; X64-NEXT: movdqa %xmm3, %xmm4 +; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movdqa %xmm2, (%rax) +; X64-NEXT: movdqa %xmm4, (%rax) +; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-NEXT: pmullw %xmm2, %xmm4 +; X64-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; X64-NEXT: pand %xmm2, %xmm4 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; X64-NEXT: pmullw %xmm3, %xmm2 -; X64-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; X64-NEXT: pand %xmm3, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: pmullw %xmm6, %xmm1 -; X64-NEXT: pand %xmm3, %xmm1 -; X64-NEXT: packuswb %xmm2, %xmm1 +; X64-NEXT: pmullw %xmm3, %xmm1 +; X64-NEXT: pand %xmm2, %xmm1 +; X64-NEXT: packuswb %xmm4, %xmm1 ; X64-NEXT: psubb %xmm1, %xmm0 ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll index 0ce461f648e45..03de1533e1d64 100644 --- a/llvm/test/CodeGen/X86/fma.ll +++ b/llvm/test/CodeGen/X86/fma.ll @@ -1129,10 +1129,10 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x4c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc9,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x5c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x9c,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x80,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill @@ -1174,10 +1174,10 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x3c,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] @@ -1187,10 +1187,10 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x30,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] @@ -1200,10 +1200,10 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x24,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] @@ -1213,10 +1213,10 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x18,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] diff --git a/llvm/test/CodeGen/X86/fold-tied-op.ll b/llvm/test/CodeGen/X86/fold-tied-op.ll index a8636a3496dc4..5ea2964057588 100644 --- a/llvm/test/CodeGen/X86/fold-tied-op.ll +++ b/llvm/test/CodeGen/X86/fold-tied-op.ll @@ -24,46 +24,45 @@ define i64 @fn1() #0 { ; CHECK-NEXT: .cfi_offset %esi, -20 ; CHECK-NEXT: .cfi_offset %edi, -16 ; CHECK-NEXT: .cfi_offset %ebx, -12 -; CHECK-NEXT: movl $-1028477379, %ebx # imm = 0xC2B2AE3D -; CHECK-NEXT: movl $668265295, %ecx # imm = 0x27D4EB4F +; CHECK-NEXT: movl $-1028477379, %ecx # imm = 0xC2B2AE3D +; CHECK-NEXT: movl $668265295, %esi # imm = 0x27D4EB4F ; CHECK-NEXT: movl a, %edi ; CHECK-NEXT: cmpl $0, (%edi) ; CHECK-NEXT: je .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: movl 8(%edi), %esi -; CHECK-NEXT: movl 12(%edi), %eax -; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: shldl $1, %esi, %edx -; CHECK-NEXT: orl %eax, %edx -; CHECK-NEXT: leal (%esi,%esi), %eax -; CHECK-NEXT: orl %esi, %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 8(%edi), %ecx +; CHECK-NEXT: movl 12(%edi), %edx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: shldl $1, %ecx, %eax +; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: leal (%ecx,%ecx), %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movl 16(%edi), %ebx -; CHECK-NEXT: movl 20(%edi), %esi -; CHECK-NEXT: movl %esi, %eax -; CHECK-NEXT: shldl $2, %ebx, %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %ebx, %eax -; CHECK-NEXT: movl %esi, %ebx -; CHECK-NEXT: shldl $31, %eax, %ebx -; CHECK-NEXT: shll $2, %eax -; CHECK-NEXT: orl %ebx, %eax -; CHECK-NEXT: shrl %esi -; CHECK-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: adcl %edx, %esi -; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 20(%edi), %edx +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: shldl $2, %ebx, %edx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: shldl $31, %ebx, %ecx +; CHECK-NEXT: shll $2, %ebx +; CHECK-NEXT: orl %ecx, %ebx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: shrl %ecx +; CHECK-NEXT: orl %edx, %ecx +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: adcl %eax, %ecx +; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movl 24(%edi), %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movl $-1028477379, %ebx # imm = 0xC2B2AE3D ; CHECK-NEXT: imull %eax, %ebx -; CHECK-NEXT: mull %ecx -; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: mull %esi +; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: addl %ebx, %edx ; CHECK-NEXT: movl 28(%edi), %edi -; CHECK-NEXT: imull %edi, %ecx -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: imull %edi, %esi +; CHECK-NEXT: addl %edx, %esi ; CHECK-NEXT: movl $1336530590, %edx # imm = 0x4FA9D69E ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; CHECK-NEXT: movl %ebx, %eax @@ -72,17 +71,17 @@ define i64 @fn1() #0 { ; CHECK-NEXT: addl %edx, %ebx ; CHECK-NEXT: imull $1336530590, %edi, %edx # imm = 0x4FA9D69E ; CHECK-NEXT: addl %ebx, %edx -; CHECK-NEXT: shrdl $3, %ecx, %esi -; CHECK-NEXT: sarl $3, %ecx -; CHECK-NEXT: orl %edx, %ecx -; CHECK-NEXT: orl %eax, %esi +; CHECK-NEXT: shrdl $3, %esi, %ecx +; CHECK-NEXT: sarl $3, %esi +; CHECK-NEXT: orl %edx, %esi +; CHECK-NEXT: orl %eax, %ecx ; CHECK-NEXT: movl $-66860409, %ebx # imm = 0xFC03CA87 -; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: mull %ebx ; CHECK-NEXT: movl %eax, %edi -; CHECK-NEXT: imull $326129324, %esi, %eax # imm = 0x137056AC +; CHECK-NEXT: imull $326129324, %ecx, %eax # imm = 0x137056AC ; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87 +; CHECK-NEXT: imull $-66860409, %esi, %ecx # imm = 0xFC03CA87 ; CHECK-NEXT: addl %eax, %ecx ; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload @@ -95,14 +94,14 @@ define i64 @fn1() #0 { ; CHECK-NEXT: imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87 ; CHECK-NEXT: jmp .LBB0_3 ; CHECK-NEXT: .LBB0_2: # %if.else -; CHECK-NEXT: xorl b+4, %ebx -; CHECK-NEXT: xorl b, %ecx +; CHECK-NEXT: xorl b+4, %ecx +; CHECK-NEXT: xorl b, %esi ; CHECK-NEXT: movl $1419758215, %edx # imm = 0x549FCA87 -; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: mull %edx -; CHECK-NEXT: imull $93298681, %ecx, %esi # imm = 0x58F9FF9 +; CHECK-NEXT: imull $93298681, %esi, %esi # imm = 0x58F9FF9 ; CHECK-NEXT: addl %edx, %esi -; CHECK-NEXT: imull $1419758215, %ebx, %ecx # imm = 0x549FCA87 +; CHECK-NEXT: imull $1419758215, %ecx, %ecx # imm = 0x549FCA87 ; CHECK-NEXT: .LBB0_3: # %if.end ; CHECK-NEXT: addl %esi, %ecx ; CHECK-NEXT: addl $-1028477341, %eax # imm = 0xC2B2AE63 diff --git a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll index 5cd9281929a32..2856cfa01fad1 100644 --- a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll +++ b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll @@ -1212,10 +1212,11 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbq %r12, %rax ; CHECK-NEXT: cmovbq %r14, %rdx +; CHECK-NEXT: movq %r14, %rbp ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovaq %r15, %rdx ; CHECK-NEXT: cmovaq %r13, %rax -; CHECK-NEXT: movq $-1, %r13 +; CHECK-NEXT: movq $-1, %r14 ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -1230,11 +1231,11 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbq %r12, %rax -; CHECK-NEXT: cmovbq %r14, %rdx +; CHECK-NEXT: cmovbq %rbp, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovaq %r15, %rdx -; CHECK-NEXT: cmovaq %r13, %rax -; CHECK-NEXT: movq $-1, %r13 +; CHECK-NEXT: cmovaq %r14, %rax +; CHECK-NEXT: movq $-1, %r14 ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -1249,10 +1250,12 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbq %r12, %rax -; CHECK-NEXT: cmovbq %r14, %rdx +; CHECK-NEXT: cmovbq %rbp, %rdx +; CHECK-NEXT: movq %rbp, %r13 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovaq %r15, %rdx -; CHECK-NEXT: cmovaq %r13, %rax +; CHECK-NEXT: cmovaq %r14, %rax +; CHECK-NEXT: movq $-1, %r14 ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -1268,12 +1271,11 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbq %r12, %rax -; CHECK-NEXT: cmovbq %r14, %rbp -; CHECK-NEXT: movq %r14, %r13 +; CHECK-NEXT: cmovbq %r13, %rbp ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovaq %r15, %rbp -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: cmovaq %rcx, %rax +; CHECK-NEXT: movq %r15, %r13 +; CHECK-NEXT: cmovaq %r14, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -1289,10 +1291,10 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbq %r12, %r14 -; CHECK-NEXT: cmovbq %r13, %r15 +; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; CHECK-NEXT: cmovbq %rax, %r15 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmovaq %rax, %r15 +; CHECK-NEXT: cmovaq %r13, %r15 ; CHECK-NEXT: movq $-1, %rax ; CHECK-NEXT: cmovaq %rax, %r14 ; CHECK-NEXT: ucomiss %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll index 36bf74f573155..065b396e82ec3 100644 --- a/llvm/test/CodeGen/X86/fshl.ll +++ b/llvm/test/CodeGen/X86/fshl.ll @@ -267,8 +267,8 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-NEXT: pushl %ebx ; X86-FAST-NEXT: pushl %edi ; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -276,16 +276,16 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-NEXT: jne .LBB6_1 ; X86-FAST-NEXT: # %bb.2: ; X86-FAST-NEXT: movl %ebx, %ebp -; X86-FAST-NEXT: movl %edx, %ebx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-FAST-NEXT: movl %edi, %eax -; X86-FAST-NEXT: movl %esi, %edi +; X86-FAST-NEXT: movl %esi, %ebx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: movl %edi, %eax +; X86-FAST-NEXT: movl %edx, %edi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: je .LBB6_5 ; X86-FAST-NEXT: .LBB6_4: -; X86-FAST-NEXT: movl %edx, %esi -; X86-FAST-NEXT: movl %edi, %edx +; X86-FAST-NEXT: movl %esi, %edx +; X86-FAST-NEXT: movl %edi, %esi ; X86-FAST-NEXT: movl %ebx, %edi ; X86-FAST-NEXT: movl %eax, %ebx ; X86-FAST-NEXT: jmp .LBB6_6 @@ -301,12 +301,12 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-NEXT: shldl %cl, %ebp, %eax ; X86-FAST-NEXT: movl %edi, %ebp ; X86-FAST-NEXT: shldl %cl, %ebx, %ebp -; X86-FAST-NEXT: movl %edx, %ebx +; X86-FAST-NEXT: movl %esi, %ebx ; X86-FAST-NEXT: shldl %cl, %edi, %ebx ; X86-FAST-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-FAST-NEXT: shldl %cl, %edx, %esi +; X86-FAST-NEXT: shldl %cl, %esi, %edx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-FAST-NEXT: movl %esi, 12(%ecx) +; X86-FAST-NEXT: movl %edx, 12(%ecx) ; X86-FAST-NEXT: movl %ebx, 8(%ecx) ; X86-FAST-NEXT: movl %ebp, 4(%ecx) ; X86-FAST-NEXT: movl %eax, (%ecx) @@ -324,26 +324,26 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi ; X86-SLOW-NEXT: pushl %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: testb $64, %al ; X86-SLOW-NEXT: jne .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: ; X86-SLOW-NEXT: movl %edx, %ebp -; X86-SLOW-NEXT: movl %edi, %edx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: movl %esi, %ebx +; X86-SLOW-NEXT: movl %ebx, %edx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: movl %edi, %ecx +; X86-SLOW-NEXT: movl %esi, %edi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SLOW-NEXT: testb $32, %al ; X86-SLOW-NEXT: je .LBB6_5 ; X86-SLOW-NEXT: .LBB6_4: -; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: movl %ebx, %edi -; X86-SLOW-NEXT: movl %edx, %ebx +; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %edi, %ebx +; X86-SLOW-NEXT: movl %edx, %edi ; X86-SLOW-NEXT: movl %ecx, %edx ; X86-SLOW-NEXT: jmp .LBB6_6 ; X86-SLOW-NEXT: .LBB6_1: @@ -364,30 +364,30 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: movb %ch, %cl ; X86-SLOW-NEXT: shrl %cl, %ebp ; X86-SLOW-NEXT: orl %esi, %ebp -; X86-SLOW-NEXT: movl %ebx, %esi +; X86-SLOW-NEXT: movl %edi, %esi ; X86-SLOW-NEXT: movb %al, %cl ; X86-SLOW-NEXT: shll %cl, %esi ; X86-SLOW-NEXT: shrl %edx ; X86-SLOW-NEXT: movb %ch, %cl ; X86-SLOW-NEXT: shrl %cl, %edx ; X86-SLOW-NEXT: orl %esi, %edx -; X86-SLOW-NEXT: movl %edi, %esi +; X86-SLOW-NEXT: movl %ebx, %esi ; X86-SLOW-NEXT: movb %al, %cl ; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: shrl %ebx +; X86-SLOW-NEXT: shrl %edi ; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shrl %cl, %ebx -; X86-SLOW-NEXT: orl %esi, %ebx +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: orl %esi, %edi ; X86-SLOW-NEXT: movb %al, %cl ; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SLOW-NEXT: shll %cl, %eax -; X86-SLOW-NEXT: shrl %edi +; X86-SLOW-NEXT: shrl %ebx ; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: orl %eax, %edi +; X86-SLOW-NEXT: shrl %cl, %ebx +; X86-SLOW-NEXT: orl %eax, %ebx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl %edi, 12(%eax) -; X86-SLOW-NEXT: movl %ebx, 8(%eax) +; X86-SLOW-NEXT: movl %ebx, 12(%eax) +; X86-SLOW-NEXT: movl %edi, 8(%eax) ; X86-SLOW-NEXT: movl %edx, 4(%eax) ; X86-SLOW-NEXT: movl %ebp, (%eax) ; X86-SLOW-NEXT: addl $4, %esp diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll index 367a3dddb8640..4340f8fd484ae 100644 --- a/llvm/test/CodeGen/X86/fshr.ll +++ b/llvm/test/CodeGen/X86/fshr.ll @@ -263,19 +263,19 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-NEXT: pushl %esi ; X86-FAST-NEXT: pushl %eax ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-FAST-NEXT: testb $64, %cl ; X86-FAST-NEXT: je .LBB6_1 ; X86-FAST-NEXT: # %bb.2: ; X86-FAST-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-FAST-NEXT: movl %esi, %edx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: movl %edi, %ebp -; X86-FAST-NEXT: movl %ebx, %edi +; X86-FAST-NEXT: movl %edi, %edx +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-FAST-NEXT: movl %esi, %ebp +; X86-FAST-NEXT: movl %ebx, %esi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: je .LBB6_4 @@ -287,19 +287,19 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: jne .LBB6_5 ; X86-FAST-NEXT: .LBB6_4: -; X86-FAST-NEXT: movl %esi, %ebx -; X86-FAST-NEXT: movl %edi, %esi -; X86-FAST-NEXT: movl %edx, %edi +; X86-FAST-NEXT: movl %edi, %ebx +; X86-FAST-NEXT: movl %esi, %edi +; X86-FAST-NEXT: movl %edx, %esi ; X86-FAST-NEXT: movl %ebp, %edx ; X86-FAST-NEXT: movl (%esp), %ebp # 4-byte Reload ; X86-FAST-NEXT: .LBB6_5: ; X86-FAST-NEXT: shrdl %cl, %edx, %ebp -; X86-FAST-NEXT: shrdl %cl, %edi, %edx -; X86-FAST-NEXT: shrdl %cl, %esi, %edi +; X86-FAST-NEXT: shrdl %cl, %esi, %edx +; X86-FAST-NEXT: shrdl %cl, %edi, %esi ; X86-FAST-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-FAST-NEXT: shrdl %cl, %ebx, %esi -; X86-FAST-NEXT: movl %esi, 12(%eax) -; X86-FAST-NEXT: movl %edi, 8(%eax) +; X86-FAST-NEXT: shrdl %cl, %ebx, %edi +; X86-FAST-NEXT: movl %edi, 12(%eax) +; X86-FAST-NEXT: movl %esi, 8(%eax) ; X86-FAST-NEXT: movl %edx, 4(%eax) ; X86-FAST-NEXT: movl %ebp, (%eax) ; X86-FAST-NEXT: addl $4, %esp @@ -317,9 +317,9 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: pushl %esi ; X86-SLOW-NEXT: subl $8, %esp ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: testb $64, %cl ; X86-SLOW-NEXT: je .LBB6_1 @@ -327,15 +327,15 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: movl %ebp, %eax ; X86-SLOW-NEXT: movl %ebx, %ebp ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: movl %esi, %edx -; X86-SLOW-NEXT: movl %edi, %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: movl %edi, %edx +; X86-SLOW-NEXT: movl %esi, %edi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SLOW-NEXT: testb $32, %cl ; X86-SLOW-NEXT: jne .LBB6_5 ; X86-SLOW-NEXT: .LBB6_4: -; X86-SLOW-NEXT: movl %ebx, %edi -; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: movl %ebp, %esi +; X86-SLOW-NEXT: movl %ebx, %esi +; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebp, %edi ; X86-SLOW-NEXT: movl %edx, %ebp ; X86-SLOW-NEXT: movl %eax, %edx ; X86-SLOW-NEXT: jmp .LBB6_6 @@ -357,28 +357,28 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-SLOW-NEXT: shrl %cl, %ebp -; X86-SLOW-NEXT: leal (%esi,%esi), %edx +; X86-SLOW-NEXT: leal (%edi,%edi), %edx ; X86-SLOW-NEXT: movl %ebx, %ecx ; X86-SLOW-NEXT: shll %cl, %edx ; X86-SLOW-NEXT: orl %ebp, %edx ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SLOW-NEXT: shrl %cl, %esi -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-SLOW-NEXT: leal (%esi,%esi), %ebp +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl (%esp), %edi # 4-byte Reload +; X86-SLOW-NEXT: leal (%edi,%edi), %ebp ; X86-SLOW-NEXT: movl %ebx, %ecx ; X86-SLOW-NEXT: shll %cl, %ebp ; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SLOW-NEXT: shrl %cl, %esi -; X86-SLOW-NEXT: addl %edi, %edi +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: addl %esi, %esi ; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shll %cl, %edi -; X86-SLOW-NEXT: orl %esi, %edi +; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: orl %edi, %esi ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLOW-NEXT: movl %edi, 12(%ecx) +; X86-SLOW-NEXT: movl %esi, 12(%ecx) ; X86-SLOW-NEXT: movl %ebp, 8(%ecx) ; X86-SLOW-NEXT: movl %edx, 4(%ecx) ; X86-SLOW-NEXT: movl %eax, (%ecx) diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll index ec916148e8e20..4123890ed1a76 100644 --- a/llvm/test/CodeGen/X86/funnel-shift.ll +++ b/llvm/test/CodeGen/X86/funnel-shift.ll @@ -980,15 +980,15 @@ define void @PR45265(i32 %0, %struct.S* nocapture readonly %1) nounwind { ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: leal (%eax,%eax,2), %edx -; X86-SSE2-NEXT: movzwl 8(%ecx,%edx,4), %esi -; X86-SSE2-NEXT: movl 4(%ecx,%edx,4), %edi -; X86-SSE2-NEXT: shrdl $8, %esi, %edi +; X86-SSE2-NEXT: leal (%eax,%eax,2), %esi +; X86-SSE2-NEXT: movzwl 8(%ecx,%esi,4), %edx +; X86-SSE2-NEXT: movl 4(%ecx,%esi,4), %edi +; X86-SSE2-NEXT: shrdl $8, %edx, %edi ; X86-SSE2-NEXT: xorl %eax, %edi ; X86-SSE2-NEXT: sarl $31, %eax -; X86-SSE2-NEXT: movzbl 10(%ecx,%edx,4), %ecx +; X86-SSE2-NEXT: movzbl 10(%ecx,%esi,4), %ecx ; X86-SSE2-NEXT: shll $16, %ecx -; X86-SSE2-NEXT: orl %esi, %ecx +; X86-SSE2-NEXT: orl %edx, %ecx ; X86-SSE2-NEXT: shll $8, %ecx ; X86-SSE2-NEXT: movl %ecx, %edx ; X86-SSE2-NEXT: sarl $8, %edx diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll index 1f64590ec03c8..bca446fa8fb56 100644 --- a/llvm/test/CodeGen/X86/haddsub-2.ll +++ b/llvm/test/CodeGen/X86/haddsub-2.ll @@ -684,10 +684,10 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) nounwind { ; SSE3-NEXT: pextrw $6, %xmm3, %r13d ; SSE3-NEXT: pextrw $7, %xmm3, %eax ; SSE3-NEXT: addl %r13d, %eax -; SSE3-NEXT: movd %r12d, %xmm2 -; SSE3-NEXT: movd %r14d, %xmm3 +; SSE3-NEXT: movd %r12d, %xmm4 +; SSE3-NEXT: movd %r14d, %xmm2 ; SSE3-NEXT: movd %ebx, %xmm5 -; SSE3-NEXT: movd %r10d, %xmm4 +; SSE3-NEXT: movd %r10d, %xmm3 ; SSE3-NEXT: movd %r8d, %xmm6 ; SSE3-NEXT: movd %esi, %xmm7 ; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload @@ -702,13 +702,13 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) nounwind { ; SSE3-NEXT: movd %r9d, %xmm14 ; SSE3-NEXT: movd %edi, %xmm15 ; SSE3-NEXT: movd %edx, %xmm1 -; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] @@ -1296,10 +1296,10 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) nounwind { ; SSE3-NEXT: pextrw $6, %xmm3, %r13d ; SSE3-NEXT: pextrw $7, %xmm3, %eax ; SSE3-NEXT: addl %r13d, %eax -; SSE3-NEXT: movd %r12d, %xmm2 -; SSE3-NEXT: movd %r15d, %xmm3 +; SSE3-NEXT: movd %r12d, %xmm4 +; SSE3-NEXT: movd %r15d, %xmm2 ; SSE3-NEXT: movd %r14d, %xmm5 -; SSE3-NEXT: movd %ebp, %xmm4 +; SSE3-NEXT: movd %ebp, %xmm3 ; SSE3-NEXT: movd %r10d, %xmm6 ; SSE3-NEXT: movd %r9d, %xmm7 ; SSE3-NEXT: movd %esi, %xmm8 @@ -1314,13 +1314,13 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) nounwind { ; SSE3-NEXT: # xmm15 = mem[0],zero,zero,zero ; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload ; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero -; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll index 348c2d0d5966f..1a2aac657d30f 100644 --- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll @@ -364,17 +364,17 @@ define i1 @scalar_i64_lowestbit_eq(i64 %x, i64 %y) nounwind { ; X86-BMI2: # %bb.0: ; X86-BMI2-NEXT: pushl %esi ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-BMI2-NEXT: movl $1, %eax -; X86-BMI2-NEXT: xorl %edx, %edx +; X86-BMI2-NEXT: movl $1, %edx ; X86-BMI2-NEXT: xorl %esi, %esi -; X86-BMI2-NEXT: shldl %cl, %eax, %esi -; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax +; X86-BMI2-NEXT: xorl %eax, %eax +; X86-BMI2-NEXT: shldl %cl, %edx, %eax +; X86-BMI2-NEXT: shlxl %ecx, %edx, %edx ; X86-BMI2-NEXT: testb $32, %cl -; X86-BMI2-NEXT: cmovnel %eax, %esi ; X86-BMI2-NEXT: cmovnel %edx, %eax -; X86-BMI2-NEXT: andl {{[0-9]+}}(%esp), %esi +; X86-BMI2-NEXT: cmovnel %esi, %edx ; X86-BMI2-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-BMI2-NEXT: orl %esi, %eax +; X86-BMI2-NEXT: andl {{[0-9]+}}(%esp), %edx +; X86-BMI2-NEXT: orl %eax, %edx ; X86-BMI2-NEXT: sete %al ; X86-BMI2-NEXT: popl %esi ; X86-BMI2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll index e1d2f9c343855..a026757a0264d 100644 --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -546,27 +546,26 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, < ; ; SSSE3-FAST-LABEL: sequential_sum_v4f32_v4f32: ; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5 ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm5 -; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4 -; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] +; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3] ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,1] -; SSSE3-FAST-NEXT: addps %xmm5, %xmm4 -; SSSE3-FAST-NEXT: addps %xmm1, %xmm4 -; SSSE3-FAST-NEXT: movaps %xmm3, %xmm0 -; SSSE3-FAST-NEXT: haddps %xmm3, %xmm0 +; SSSE3-FAST-NEXT: addps %xmm5, %xmm0 +; SSSE3-FAST-NEXT: addps %xmm1, %xmm0 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1 -; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSSE3-FAST-NEXT: addps %xmm0, %xmm1 +; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1 +; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2 +; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSSE3-FAST-NEXT: addps %xmm1, %xmm2 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSSE3-FAST-NEXT: addps %xmm1, %xmm3 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSSE3-FAST-NEXT: movaps %xmm4, %xmm0 +; SSSE3-FAST-NEXT: addps %xmm2, %xmm3 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] ; SSSE3-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: sequential_sum_v4f32_v4f32: diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll index fd806e56af080..9f58ed0843348 100644 --- a/llvm/test/CodeGen/X86/i128-mul.ll +++ b/llvm/test/CodeGen/X86/i128-mul.ll @@ -14,21 +14,21 @@ define i64 @foo(i64 %x, i64 %y) nounwind { ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl %edi, %eax ; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: movl %edx, %ebx -; X86-NOBMI-NEXT: movl %ebp, %eax +; X86-NOBMI-NEXT: movl %edx, %ebp +; X86-NOBMI-NEXT: movl %ebx, %eax ; X86-NOBMI-NEXT: mull %esi ; X86-NOBMI-NEXT: movl %edx, %esi -; X86-NOBMI-NEXT: movl %eax, %ebp -; X86-NOBMI-NEXT: addl %ebx, %ebp +; X86-NOBMI-NEXT: movl %eax, %ebx +; X86-NOBMI-NEXT: addl %ebp, %ebx ; X86-NOBMI-NEXT: adcl $0, %esi ; X86-NOBMI-NEXT: movl %edi, %eax ; X86-NOBMI-NEXT: mull %ecx -; X86-NOBMI-NEXT: addl %ebp, %eax +; X86-NOBMI-NEXT: addl %ebx, %eax ; X86-NOBMI-NEXT: adcl %edx, %esi ; X86-NOBMI-NEXT: setb %al ; X86-NOBMI-NEXT: movzbl %al, %edi @@ -105,7 +105,7 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-NOBMI-NEXT: pushl %ebx ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: subl $24, %esp +; X86-NOBMI-NEXT: subl $20, %esp ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: orl %ecx, %eax @@ -121,44 +121,44 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: movl (%eax,%ecx,8), %esi +; X86-NOBMI-NEXT: movl (%eax,%ecx,8), %edi ; X86-NOBMI-NEXT: movl 4(%eax,%ecx,8), %ebx ; X86-NOBMI-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NOBMI-NEXT: movl %esi, %eax -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOBMI-NEXT: mull %edi -; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOBMI-NEXT: movl %edi, %eax +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOBMI-NEXT: mull %esi +; X86-NOBMI-NEXT: movl %edx, %ebp ; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %ebx, %eax -; X86-NOBMI-NEXT: mull %edi -; X86-NOBMI-NEXT: movl %edx, %ebp -; X86-NOBMI-NEXT: movl %eax, %ebx -; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NOBMI-NEXT: adcl $0, %ebp -; X86-NOBMI-NEXT: movl %esi, %eax +; X86-NOBMI-NEXT: mull %esi +; X86-NOBMI-NEXT: movl %edx, %ebx +; X86-NOBMI-NEXT: movl %eax, %esi +; X86-NOBMI-NEXT: addl %ebp, %esi +; X86-NOBMI-NEXT: adcl $0, %ebx +; X86-NOBMI-NEXT: movl %edi, %eax ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: mull %edx -; X86-NOBMI-NEXT: movl %edx, %edi -; X86-NOBMI-NEXT: movl %eax, %esi -; X86-NOBMI-NEXT: addl %ebx, %esi -; X86-NOBMI-NEXT: adcl %ebp, %edi +; X86-NOBMI-NEXT: movl %edx, %ebp +; X86-NOBMI-NEXT: movl %eax, %edi +; X86-NOBMI-NEXT: addl %esi, %edi +; X86-NOBMI-NEXT: adcl %ebx, %ebp ; X86-NOBMI-NEXT: setb %bl ; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOBMI-NEXT: addl %edi, %eax -; X86-NOBMI-NEXT: movzbl %bl, %edi +; X86-NOBMI-NEXT: addl %ebp, %eax +; X86-NOBMI-NEXT: movzbl %bl, %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NOBMI-NEXT: adcl %edi, %edx +; X86-NOBMI-NEXT: adcl %esi, %edx ; X86-NOBMI-NEXT: movl %ecx, %ebx ; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NOBMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NOBMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NOBMI-NEXT: adcl $0, %eax ; X86-NOBMI-NEXT: adcl $0, %edx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOBMI-NEXT: movl %ecx, (%edi,%ebx,8) +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOBMI-NEXT: movl %ecx, (%esi,%ebx,8) ; X86-NOBMI-NEXT: movl %ebx, %ecx -; X86-NOBMI-NEXT: movl %esi, 4(%edi,%ebx,8) +; X86-NOBMI-NEXT: movl %edi, 4(%esi,%ebx,8) ; X86-NOBMI-NEXT: addl $1, %ecx ; X86-NOBMI-NEXT: movl (%esp), %edi # 4-byte Reload ; X86-NOBMI-NEXT: adcl $0, %edi @@ -171,7 +171,7 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-NOBMI-NEXT: .LBB1_3: # %for.end ; X86-NOBMI-NEXT: xorl %eax, %eax ; X86-NOBMI-NEXT: xorl %edx, %edx -; X86-NOBMI-NEXT: addl $24, %esp +; X86-NOBMI-NEXT: addl $20, %esp ; X86-NOBMI-NEXT: popl %esi ; X86-NOBMI-NEXT: popl %edi ; X86-NOBMI-NEXT: popl %ebx @@ -209,14 +209,14 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-BMI-NEXT: mulxl %eax, %edx, %edi ; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-BMI-NEXT: movl %esi, %edx -; X86-BMI-NEXT: mulxl %eax, %eax, %esi -; X86-BMI-NEXT: addl %edi, %eax -; X86-BMI-NEXT: adcl $0, %esi +; X86-BMI-NEXT: mulxl %eax, %esi, %eax +; X86-BMI-NEXT: addl %edi, %esi +; X86-BMI-NEXT: adcl $0, %eax ; X86-BMI-NEXT: movl %ecx, %edx ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI-NEXT: mulxl %ecx, %edi, %ebp -; X86-BMI-NEXT: addl %eax, %edi -; X86-BMI-NEXT: adcl %esi, %ebp +; X86-BMI-NEXT: addl %esi, %edi +; X86-BMI-NEXT: adcl %eax, %ebp ; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-BMI-NEXT: mulxl %ecx, %ecx, %eax ; X86-BMI-NEXT: setb %dl diff --git a/llvm/test/CodeGen/X86/legalize-shl-vec.ll b/llvm/test/CodeGen/X86/legalize-shl-vec.ll index 2c24db9afb54a..cf423227f23bc 100644 --- a/llvm/test/CodeGen/X86/legalize-shl-vec.ll +++ b/llvm/test/CodeGen/X86/legalize-shl-vec.ll @@ -87,34 +87,35 @@ define <2 x i256> @test_srl(<2 x i256> %In) { ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl %ebp, %ebx -; X32-NEXT: shldl $28, %edx, %ebx -; X32-NEXT: shldl $28, %esi, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: shldl $28, %ecx, %esi -; X32-NEXT: movl %esi, (%esp) # 4-byte Spill -; X32-NEXT: shldl $28, %edi, %ecx +; X32-NEXT: movl %ebp, %esi +; X32-NEXT: shldl $28, %edx, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: shldl $28, %ebx, %edx +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: shldl $28, %ecx, %ebx +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shldl $28, %edi, %esi ; X32-NEXT: shldl $28, %eax, %edi -; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl %eax, %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shldl $28, %eax, %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: shrdl $4, %eax, %edx +; X32-NEXT: shldl $28, %eax, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: shrdl $4, %eax, %ecx ; X32-NEXT: shrl $4, %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl %ebp, 60(%eax) -; X32-NEXT: movl %ebx, 56(%eax) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, 52(%eax) -; X32-NEXT: movl (%esp), %ebx # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, 56(%eax) +; X32-NEXT: movl (%esp), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, 52(%eax) ; X32-NEXT: movl %ebx, 48(%eax) -; X32-NEXT: movl %ecx, 44(%eax) +; X32-NEXT: movl %esi, 44(%eax) ; X32-NEXT: movl %edi, 40(%eax) -; X32-NEXT: movl %esi, 36(%eax) -; X32-NEXT: movl %edx, 32(%eax) +; X32-NEXT: movl %edx, 36(%eax) +; X32-NEXT: movl %ecx, 32(%eax) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: shrl $31, %ecx ; X32-NEXT: movl %ecx, (%eax) @@ -182,34 +183,35 @@ define <2 x i256> @test_sra(<2 x i256> %In) { ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl %ebp, %ebx -; X32-NEXT: shldl $26, %edx, %ebx -; X32-NEXT: shldl $26, %esi, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: shldl $26, %ecx, %esi -; X32-NEXT: movl %esi, (%esp) # 4-byte Spill -; X32-NEXT: shldl $26, %edi, %ecx +; X32-NEXT: movl %ebp, %esi +; X32-NEXT: shldl $26, %edx, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: shldl $26, %ebx, %edx +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: shldl $26, %ecx, %ebx +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shldl $26, %edi, %esi ; X32-NEXT: shldl $26, %eax, %edi -; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl %eax, %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shldl $26, %eax, %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: shrdl $6, %eax, %edx +; X32-NEXT: shldl $26, %eax, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: shrdl $6, %eax, %ecx ; X32-NEXT: sarl $6, %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl %ebp, 60(%eax) -; X32-NEXT: movl %ebx, 56(%eax) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, 52(%eax) -; X32-NEXT: movl (%esp), %ebx # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, 56(%eax) +; X32-NEXT: movl (%esp), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, 52(%eax) ; X32-NEXT: movl %ebx, 48(%eax) -; X32-NEXT: movl %ecx, 44(%eax) +; X32-NEXT: movl %esi, 44(%eax) ; X32-NEXT: movl %edi, 40(%eax) -; X32-NEXT: movl %esi, 36(%eax) -; X32-NEXT: movl %edx, 32(%eax) +; X32-NEXT: movl %edx, 36(%eax) +; X32-NEXT: movl %ecx, 32(%eax) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: sarl $31, %ecx ; X32-NEXT: movl %ecx, 28(%eax) diff --git a/llvm/test/CodeGen/X86/machine-cp.ll b/llvm/test/CodeGen/X86/machine-cp.ll index bed0e3a24a017..f84960485840d 100644 --- a/llvm/test/CodeGen/X86/machine-cp.ll +++ b/llvm/test/CodeGen/X86/machine-cp.ll @@ -100,14 +100,14 @@ define <16 x float> @foo(<16 x float> %x) { ; CHECK-LABEL: foo: ; CHECK: ## %bb.0: ## %bb ; CHECK-NEXT: xorps %xmm5, %xmm5 -; CHECK-NEXT: cvttps2dq %xmm3, %xmm7 +; CHECK-NEXT: cvttps2dq %xmm3, %xmm8 ; CHECK-NEXT: movaps %xmm3, %xmm4 ; CHECK-NEXT: cmpltps %xmm5, %xmm4 -; CHECK-NEXT: movaps {{.*#+}} xmm8 = [13,14,15,16] +; CHECK-NEXT: movaps {{.*#+}} xmm7 = [13,14,15,16] ; CHECK-NEXT: movaps %xmm4, %xmm6 -; CHECK-NEXT: orps %xmm8, %xmm6 -; CHECK-NEXT: cvtdq2ps %xmm7, %xmm3 -; CHECK-NEXT: andps %xmm8, %xmm3 +; CHECK-NEXT: orps %xmm7, %xmm6 +; CHECK-NEXT: cvtdq2ps %xmm8, %xmm3 +; CHECK-NEXT: andps %xmm7, %xmm3 ; CHECK-NEXT: andps %xmm6, %xmm3 ; CHECK-NEXT: andnps %xmm4, %xmm6 ; CHECK-NEXT: cvttps2dq %xmm2, %xmm4 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll index e07312c902d19..38abaf8ff11c6 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -183,36 +183,36 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm6 ; SSE4-NEXT: pxor %xmm7, %xmm7 -; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647] -; SSE4-NEXT: movdqa %xmm9, %xmm0 +; SSE4-NEXT: movdqa {{.*#+}} xmm10 = [2147483647,2147483647] +; SSE4-NEXT: movdqa %xmm10, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE4-NEXT: movdqa %xmm9, %xmm8 +; SSE4-NEXT: movdqa %xmm10, %xmm8 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8 -; SSE4-NEXT: movdqa %xmm9, %xmm0 +; SSE4-NEXT: movdqa %xmm10, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: movdqa %xmm9, %xmm10 -; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm10 -; SSE4-NEXT: movdqa %xmm9, %xmm0 +; SSE4-NEXT: movdqa %xmm10, %xmm9 +; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm9 +; SSE4-NEXT: movdqa %xmm10, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0 -; SSE4-NEXT: movdqa %xmm9, %xmm3 +; SSE4-NEXT: movdqa %xmm10, %xmm3 ; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3 -; SSE4-NEXT: movdqa %xmm9, %xmm0 +; SSE4-NEXT: movdqa %xmm10, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm9 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10 ; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] -; SSE4-NEXT: movapd %xmm9, %xmm0 +; SSE4-NEXT: movapd %xmm10, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm6 -; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm6 +; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm6 ; SSE4-NEXT: movapd %xmm3, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm2 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2 ; SSE4-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2] -; SSE4-NEXT: movapd %xmm10, %xmm0 +; SSE4-NEXT: movapd %xmm9, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: movdqa %xmm1, %xmm3 -; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm3 +; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm3 ; SSE4-NEXT: movapd %xmm8, %xmm0 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm1 diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll index f2a3094b6b2d8..c8c5afbd579df 100644 --- a/llvm/test/CodeGen/X86/matrix-multiply.ll +++ b/llvm/test/CodeGen/X86/matrix-multiply.ll @@ -1637,14 +1637,13 @@ entry: define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwind { ; SSE-LABEL: test_mul8x8_f32: ; SSE: # %bb.0: # %entry -; SSE-NEXT: subq $88, %rsp -; SSE-NEXT: movaps %xmm7, %xmm10 -; SSE-NEXT: movaps %xmm6, %xmm9 +; SSE-NEXT: subq $120, %rsp ; SSE-NEXT: movaps %xmm5, %xmm11 -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm9 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm15 @@ -1654,10 +1653,10 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: mulps %xmm0, %xmm15 ; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[1,1] -; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: movaps %xmm3, %xmm10 ; SSE-NEXT: movaps %xmm3, %xmm12 -; SSE-NEXT: mulps %xmm0, %xmm8 -; SSE-NEXT: addps %xmm5, %xmm8 +; SSE-NEXT: mulps %xmm0, %xmm10 +; SSE-NEXT: addps %xmm5, %xmm10 ; SSE-NEXT: mulps %xmm2, %xmm0 ; SSE-NEXT: addps %xmm15, %xmm0 ; SSE-NEXT: movaps %xmm14, %xmm1 @@ -1668,10 +1667,9 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: addps %xmm0, %xmm2 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: mulps %xmm11, %xmm1 -; SSE-NEXT: addps %xmm8, %xmm1 +; SSE-NEXT: addps %xmm10, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3,3,3] -; SSE-NEXT: movaps %xmm10, %xmm8 -; SSE-NEXT: movaps %xmm10, %xmm3 +; SSE-NEXT: movaps %xmm7, %xmm3 ; SSE-NEXT: mulps %xmm14, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 ; SSE-NEXT: mulps %xmm6, %xmm14 @@ -1685,7 +1683,7 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: addps %xmm3, %xmm1 ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: movaps %xmm8, %xmm3 ; SSE-NEXT: mulps %xmm0, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 @@ -1702,56 +1700,56 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: mulps %xmm5, %xmm0 ; SSE-NEXT: addps %xmm1, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 -; SSE-NEXT: mulps %xmm14, %xmm5 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: addps %xmm2, %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movaps %xmm9, %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] ; SSE-NEXT: movaps %xmm12, %xmm4 ; SSE-NEXT: mulps %xmm3, %xmm4 ; SSE-NEXT: addps %xmm2, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: mulps %xmm6, %xmm1 -; SSE-NEXT: mulps %xmm7, %xmm3 -; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: mulps %xmm5, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: mulps %xmm13, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] -; SSE-NEXT: movaps %xmm15, %xmm5 ; SSE-NEXT: movaps %xmm15, %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm11, %xmm8 ; SSE-NEXT: mulps %xmm11, %xmm1 ; SSE-NEXT: addps %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movaps %xmm8, %xmm7 -; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movaps %xmm7, %xmm3 ; SSE-NEXT: mulps %xmm0, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 -; SSE-NEXT: mulps %xmm9, %xmm0 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: mulps %xmm6, %xmm0 ; SSE-NEXT: addps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0] -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm0, %xmm2 ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: addps %xmm3, %xmm1 ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm3 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm3 ; SSE-NEXT: mulps %xmm0, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulps %xmm1, %xmm0 ; SSE-NEXT: addps %xmm2, %xmm0 ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[2,2] @@ -1765,63 +1763,60 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: mulps %xmm4, %xmm0 ; SSE-NEXT: addps %xmm1, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: mulps %xmm14, %xmm4 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: addps %xmm2, %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] -; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movaps %xmm9, %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] ; SSE-NEXT: movaps %xmm12, %xmm4 ; SSE-NEXT: mulps %xmm3, %xmm4 ; SSE-NEXT: addps %xmm2, %xmm4 -; SSE-NEXT: mulps %xmm6, %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: mulps %xmm5, %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm10 ; SSE-NEXT: mulps %xmm13, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: movaps %xmm5, %xmm14 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: movaps %xmm15, %xmm5 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm11, %xmm6 -; SSE-NEXT: mulps %xmm11, %xmm1 +; SSE-NEXT: mulps %xmm8, %xmm1 ; SSE-NEXT: addps %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: movaps %xmm7, %xmm3 -; SSE-NEXT: movaps %xmm7, %xmm5 ; SSE-NEXT: mulps %xmm0, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 -; SSE-NEXT: mulps %xmm9, %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: mulps %xmm6, %xmm0 ; SSE-NEXT: addps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0] -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: movaps %xmm14, %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm0, %xmm2 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulps %xmm7, %xmm1 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulps %xmm14, %xmm1 ; SSE-NEXT: addps %xmm3, %xmm1 ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: movaps %xmm15, %xmm3 +; SSE-NEXT: movaps %xmm11, %xmm3 ; SSE-NEXT: mulps %xmm0, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulps %xmm1, %xmm0 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: addps %xmm2, %xmm0 ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[2,2] ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm0, %xmm2 -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: mulps %xmm11, %xmm1 ; SSE-NEXT: addps %xmm3, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 @@ -1835,96 +1830,91 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] -; SSE-NEXT: movaps %xmm10, %xmm9 -; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movaps %xmm9, %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] ; SSE-NEXT: movaps %xmm12, %xmm4 ; SSE-NEXT: mulps %xmm3, %xmm4 ; SSE-NEXT: addps %xmm2, %xmm4 -; SSE-NEXT: mulps %xmm8, %xmm1 +; SSE-NEXT: movaps %xmm10, %xmm15 +; SSE-NEXT: mulps %xmm10, %xmm1 ; SSE-NEXT: mulps %xmm13, %xmm3 -; SSE-NEXT: movaps %xmm13, %xmm10 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: addps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] -; SSE-NEXT: movaps %xmm14, %xmm13 -; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: movaps %xmm5, %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm3, %xmm2 -; SSE-NEXT: mulps %xmm6, %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm14 +; SSE-NEXT: mulps %xmm8, %xmm1 ; SSE-NEXT: addps %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm7, %xmm4 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm3 ; SSE-NEXT: mulps %xmm0, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 -; SSE-NEXT: mulps %xmm11, %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: mulps %xmm6, %xmm0 ; SSE-NEXT: addps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm11[0,0] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm10[0,0] ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm7, %xmm1 +; SSE-NEXT: mulps %xmm14, %xmm1 ; SSE-NEXT: addps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm11[1,1] -; SSE-NEXT: movaps %xmm15, %xmm3 +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: mulps %xmm0, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: addps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm11[2,2] +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm10[2,2] ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm0, %xmm2 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulps %xmm0, %xmm1 +; SSE-NEXT: mulps %xmm11, %xmm1 ; SSE-NEXT: addps %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3,3,3] -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3,3,3] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: mulps %xmm10, %xmm0 ; SSE-NEXT: addps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: addps %xmm2, %xmm11 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: addps %xmm2, %xmm10 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] ; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: movaps %xmm9, %xmm5 +; SSE-NEXT: movaps %xmm9, %xmm14 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] ; SSE-NEXT: movaps %xmm12, %xmm7 ; SSE-NEXT: mulps %xmm3, %xmm7 ; SSE-NEXT: addps %xmm2, %xmm7 -; SSE-NEXT: mulps %xmm8, %xmm1 -; SSE-NEXT: mulps %xmm10, %xmm3 +; SSE-NEXT: mulps %xmm15, %xmm1 +; SSE-NEXT: mulps %xmm13, %xmm3 ; SSE-NEXT: addps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] -; SSE-NEXT: movaps %xmm13, %xmm2 -; SSE-NEXT: movaps %xmm13, %xmm10 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm14, %xmm9 -; SSE-NEXT: mulps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: mulps %xmm8, %xmm1 ; SSE-NEXT: addps %xmm7, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: movaps %xmm4, %xmm7 -; SSE-NEXT: movaps %xmm4, %xmm8 ; SSE-NEXT: mulps %xmm0, %xmm7 ; SSE-NEXT: addps %xmm1, %xmm7 ; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: mulps %xmm6, %xmm0 ; SSE-NEXT: addps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 @@ -1937,8 +1927,9 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: addps %xmm7, %xmm1 ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] -; SSE-NEXT: mulps %xmm0, %xmm15 -; SSE-NEXT: addps %xmm1, %xmm15 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulps %xmm0, %xmm7 +; SSE-NEXT: addps %xmm1, %xmm7 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulps %xmm1, %xmm0 ; SSE-NEXT: addps %xmm2, %xmm0 @@ -1948,19 +1939,20 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm0, %xmm2 ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: addps %xmm15, %xmm1 +; SSE-NEXT: addps %xmm7, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps %xmm11, %xmm0 ; SSE-NEXT: mulps %xmm4, %xmm0 ; SSE-NEXT: addps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: addps %xmm2, %xmm4 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: movaps %xmm14, %xmm6 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm14 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm0[1,1] @@ -1969,24 +1961,26 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: mulps %xmm14, %xmm15 ; SSE-NEXT: addps %xmm2, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: mulps %xmm5, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: mulps %xmm6, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: mulps %xmm8, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: mulps %xmm7, %xmm14 ; SSE-NEXT: addps %xmm1, %xmm14 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] -; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movaps %xmm5, %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm14, %xmm2 ; SSE-NEXT: mulps %xmm9, %xmm1 -; SSE-NEXT: movaps %xmm9, %xmm12 +; SSE-NEXT: movaps %xmm9, %xmm11 ; SSE-NEXT: addps %xmm15, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movaps %xmm8, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm14 ; SSE-NEXT: mulps %xmm0, %xmm14 ; SSE-NEXT: addps %xmm1, %xmm14 ; SSE-NEXT: mulps %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm12 ; SSE-NEXT: addps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm1 @@ -1994,7 +1988,8 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 ; SSE-NEXT: mulps %xmm1, %xmm15 ; SSE-NEXT: addps %xmm0, %xmm15 -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulps %xmm0, %xmm1 ; SSE-NEXT: addps %xmm14, %xmm1 ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] @@ -2019,28 +2014,28 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] -; SSE-NEXT: mulps %xmm1, %xmm7 +; SSE-NEXT: mulps %xmm1, %xmm6 ; SSE-NEXT: movaps %xmm0, %xmm15 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm0[1,1] ; SSE-NEXT: mulps %xmm15, %xmm13 -; SSE-NEXT: addps %xmm7, %xmm13 -; SSE-NEXT: mulps %xmm5, %xmm1 -; SSE-NEXT: mulps %xmm6, %xmm15 -; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: addps %xmm6, %xmm13 +; SSE-NEXT: mulps %xmm8, %xmm1 +; SSE-NEXT: mulps %xmm7, %xmm15 ; SSE-NEXT: addps %xmm1, %xmm15 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] -; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm15, %xmm2 ; SSE-NEXT: mulps %xmm9, %xmm1 ; SSE-NEXT: addps %xmm13, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: movaps %xmm5, %xmm9 ; SSE-NEXT: mulps %xmm0, %xmm9 ; SSE-NEXT: addps %xmm1, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: mulps %xmm6, %xmm0 +; SSE-NEXT: mulps %xmm12, %xmm0 +; SSE-NEXT: movaps %xmm12, %xmm5 ; SSE-NEXT: addps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -2048,7 +2043,8 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 ; SSE-NEXT: mulps %xmm2, %xmm15 ; SSE-NEXT: addps %xmm0, %xmm15 -; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulps %xmm0, %xmm2 ; SSE-NEXT: addps %xmm9, %xmm2 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] @@ -2075,25 +2071,27 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: mulps %xmm2, %xmm13 -; SSE-NEXT: mulps %xmm5, %xmm2 +; SSE-NEXT: mulps %xmm8, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: mulps %xmm9, %xmm5 -; SSE-NEXT: addps %xmm13, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: mulps %xmm9, %xmm8 +; SSE-NEXT: addps %xmm13, %xmm8 ; SSE-NEXT: mulps %xmm7, %xmm9 ; SSE-NEXT: addps %xmm2, %xmm9 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm0[2,2] -; SSE-NEXT: mulps %xmm2, %xmm10 -; SSE-NEXT: addps %xmm9, %xmm10 -; SSE-NEXT: mulps %xmm12, %xmm2 -; SSE-NEXT: addps %xmm5, %xmm2 +; SSE-NEXT: mulps %xmm2, %xmm6 +; SSE-NEXT: addps %xmm9, %xmm6 +; SSE-NEXT: mulps %xmm11, %xmm2 +; SSE-NEXT: addps %xmm8, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: mulps %xmm0, %xmm8 -; SSE-NEXT: addps %xmm2, %xmm8 -; SSE-NEXT: mulps %xmm6, %xmm0 -; SSE-NEXT: addps %xmm10, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: mulps %xmm0, %xmm9 +; SSE-NEXT: addps %xmm2, %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm12 +; SSE-NEXT: mulps %xmm5, %xmm0 +; SSE-NEXT: addps %xmm6, %xmm0 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: movaps %xmm9, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm9[0,0] @@ -2101,7 +2099,7 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: mulps %xmm2, %xmm13 ; SSE-NEXT: addps %xmm0, %xmm13 ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: addps %xmm8, %xmm2 +; SSE-NEXT: addps %xmm12, %xmm2 ; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12 @@ -2128,12 +2126,12 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: movaps %xmm1, 192(%rdi) ; SSE-NEXT: movaps %xmm14, 176(%rdi) ; SSE-NEXT: movaps %xmm3, 160(%rdi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdi) ; SSE-NEXT: movaps %xmm4, 128(%rdi) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdi) -; SSE-NEXT: movaps %xmm11, 96(%rdi) +; SSE-NEXT: movaps %xmm10, 96(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2146,7 +2144,7 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: movaps %xmm0, 16(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdi) -; SSE-NEXT: addq $88, %rsp +; SSE-NEXT: addq $120, %rsp ; SSE-NEXT: retq ; ; AVX1-LABEL: test_mul8x8_f32: @@ -3287,48 +3285,48 @@ entry: define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) nounwind { ; SSE-LABEL: test_mul8x8_f64: ; SSE: # %bb.0: # %entry -; SSE-NEXT: subq $344, %rsp # imm = 0x158 -; SSE-NEXT: movapd %xmm7, %xmm14 -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: subq $328, %rsp # imm = 0x148 +; SSE-NEXT: movapd %xmm7, %xmm15 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm4, %xmm15 -; SSE-NEXT: movapd %xmm3, %xmm10 +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movq %rdi, %rax -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 -; SSE-NEXT: movapd %xmm12, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm12[0] -; SSE-NEXT: mulpd %xmm3, %xmm10 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: movapd %xmm13, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm13[0] +; SSE-NEXT: movapd %xmm3, %xmm10 +; SSE-NEXT: mulpd %xmm12, %xmm10 ; SSE-NEXT: movapd %xmm2, %xmm8 -; SSE-NEXT: mulpd %xmm3, %xmm8 +; SSE-NEXT: mulpd %xmm12, %xmm8 ; SSE-NEXT: movapd %xmm1, %xmm9 -; SSE-NEXT: mulpd %xmm3, %xmm9 -; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1,1] +; SSE-NEXT: mulpd %xmm12, %xmm9 +; SSE-NEXT: mulpd %xmm0, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1,1] ; SSE-NEXT: movapd %xmm7, %xmm2 -; SSE-NEXT: mulpd %xmm12, %xmm2 +; SSE-NEXT: mulpd %xmm13, %xmm2 ; SSE-NEXT: addpd %xmm10, %xmm2 ; SSE-NEXT: movapd %xmm6, %xmm7 -; SSE-NEXT: mulpd %xmm12, %xmm7 +; SSE-NEXT: movapd %xmm6, %xmm10 +; SSE-NEXT: mulpd %xmm13, %xmm7 ; SSE-NEXT: addpd %xmm8, %xmm7 ; SSE-NEXT: movapd %xmm5, %xmm8 -; SSE-NEXT: mulpd %xmm12, %xmm8 +; SSE-NEXT: mulpd %xmm13, %xmm8 ; SSE-NEXT: addpd %xmm9, %xmm8 -; SSE-NEXT: mulpd %xmm4, %xmm12 -; SSE-NEXT: addpd %xmm3, %xmm12 +; SSE-NEXT: mulpd %xmm4, %xmm13 +; SSE-NEXT: addpd %xmm12, %xmm13 ; SSE-NEXT: movapd %xmm11, %xmm6 ; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm11[0] -; SSE-NEXT: movapd %xmm13, %xmm3 -; SSE-NEXT: mulpd %xmm6, %xmm3 -; SSE-NEXT: addpd %xmm12, %xmm3 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movapd %xmm14, %xmm1 ; SSE-NEXT: mulpd %xmm6, %xmm1 -; SSE-NEXT: addpd %xmm8, %xmm1 +; SSE-NEXT: addpd %xmm13, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm6, %xmm3 +; SSE-NEXT: addpd %xmm8, %xmm3 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: mulpd %xmm6, %xmm5 ; SSE-NEXT: addpd %xmm7, %xmm5 @@ -3343,9 +3341,9 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: addpd %xmm5, %xmm4 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: mulpd %xmm11, %xmm5 -; SSE-NEXT: addpd %xmm1, %xmm5 +; SSE-NEXT: addpd %xmm3, %xmm5 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm11 -; SSE-NEXT: addpd %xmm3, %xmm11 +; SSE-NEXT: addpd %xmm1, %xmm11 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm6 ; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0] @@ -3405,31 +3403,34 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movapd %xmm10, %xmm3 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movapd %xmm11, %xmm3 ; SSE-NEXT: mulpd %xmm0, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd %xmm14, %xmm13 -; SSE-NEXT: movapd %xmm14, %xmm2 +; SSE-NEXT: movapd %xmm15, %xmm8 +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm15, %xmm2 ; SSE-NEXT: mulpd %xmm1, %xmm2 ; SSE-NEXT: addpd %xmm3, %xmm2 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movapd %xmm12, %xmm3 -; SSE-NEXT: mulpd %xmm0, %xmm3 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movapd %xmm9, %xmm4 +; SSE-NEXT: movapd %xmm9, %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: movapd %xmm10, %xmm15 +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm10, %xmm4 ; SSE-NEXT: mulpd %xmm1, %xmm4 ; SSE-NEXT: addpd %xmm3, %xmm4 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movapd %xmm14, %xmm3 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movapd %xmm13, %xmm3 ; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movapd %xmm11, %xmm5 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movapd %xmm10, %xmm5 ; SSE-NEXT: mulpd %xmm1, %xmm5 ; SSE-NEXT: addpd %xmm3, %xmm5 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: mulpd %xmm8, %xmm0 -; SSE-NEXT: mulpd %xmm15, %xmm1 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: mulpd %xmm12, %xmm0 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: mulpd %xmm14, %xmm1 ; SSE-NEXT: addpd %xmm0, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm6 @@ -3455,7 +3456,8 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: mulpd %xmm0, %xmm5 ; SSE-NEXT: addpd %xmm1, %xmm5 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: addpd %xmm3, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm6 @@ -3481,7 +3483,8 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: mulpd %xmm1, %xmm5 ; SSE-NEXT: addpd %xmm7, %xmm5 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm2, %xmm1 ; SSE-NEXT: addpd %xmm3, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 ; SSE-NEXT: movapd %xmm6, %xmm3 @@ -3516,26 +3519,28 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movapd %xmm10, %xmm3 +; SSE-NEXT: movapd %xmm11, %xmm3 ; SSE-NEXT: mulpd %xmm0, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd %xmm13, %xmm2 +; SSE-NEXT: movapd %xmm8, %xmm2 ; SSE-NEXT: mulpd %xmm1, %xmm2 ; SSE-NEXT: addpd %xmm3, %xmm2 -; SSE-NEXT: movapd %xmm12, %xmm3 +; SSE-NEXT: movapd %xmm9, %xmm3 ; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: movapd %xmm9, %xmm4 +; SSE-NEXT: movapd %xmm15, %xmm4 ; SSE-NEXT: mulpd %xmm1, %xmm4 ; SSE-NEXT: addpd %xmm3, %xmm4 -; SSE-NEXT: movapd %xmm14, %xmm3 +; SSE-NEXT: movapd %xmm13, %xmm8 +; SSE-NEXT: movapd %xmm13, %xmm3 ; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: movapd %xmm11, %xmm5 -; SSE-NEXT: movapd %xmm11, %xmm12 +; SSE-NEXT: movapd %xmm10, %xmm5 +; SSE-NEXT: movapd %xmm10, %xmm15 ; SSE-NEXT: mulpd %xmm1, %xmm5 ; SSE-NEXT: addpd %xmm3, %xmm5 -; SSE-NEXT: mulpd %xmm8, %xmm0 -; SSE-NEXT: movapd %xmm8, %xmm11 -; SSE-NEXT: mulpd %xmm15, %xmm1 +; SSE-NEXT: movapd %xmm12, %xmm10 +; SSE-NEXT: mulpd %xmm12, %xmm0 +; SSE-NEXT: movapd %xmm14, %xmm9 +; SSE-NEXT: mulpd %xmm14, %xmm1 ; SSE-NEXT: addpd %xmm0, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm6 @@ -3562,8 +3567,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: mulpd %xmm0, %xmm5 ; SSE-NEXT: addpd %xmm1, %xmm5 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: addpd %xmm3, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm6 @@ -3577,8 +3581,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: mulpd %xmm6, %xmm5 ; SSE-NEXT: addpd %xmm4, %xmm5 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm0, %xmm6 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 ; SSE-NEXT: addpd %xmm2, %xmm6 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 @@ -3593,9 +3596,9 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulpd %xmm2, %xmm1 ; SSE-NEXT: addpd %xmm3, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: movapd %xmm6, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: movapd %xmm7, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulpd %xmm3, %xmm2 ; SSE-NEXT: addpd %xmm1, %xmm2 @@ -3608,205 +3611,209 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: mulpd %xmm4, %xmm3 ; SSE-NEXT: addpd %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm6, %xmm0 +; SSE-NEXT: mulpd %xmm7, %xmm0 ; SSE-NEXT: addpd %xmm3, %xmm0 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm6, %xmm0 +; SSE-NEXT: mulpd %xmm7, %xmm0 ; SSE-NEXT: addpd %xmm5, %xmm0 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm6, %xmm0 +; SSE-NEXT: mulpd %xmm7, %xmm0 ; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm0, %xmm6 -; SSE-NEXT: addpd %xmm2, %xmm6 -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: mulpd %xmm0, %xmm7 +; SSE-NEXT: addpd %xmm2, %xmm7 +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movapd %xmm10, %xmm8 -; SSE-NEXT: movapd %xmm10, %xmm3 +; SSE-NEXT: movapd %xmm11, %xmm3 +; SSE-NEXT: movapd %xmm11, %xmm12 ; SSE-NEXT: mulpd %xmm0, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd %xmm13, %xmm2 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movapd %xmm6, %xmm2 ; SSE-NEXT: mulpd %xmm1, %xmm2 ; SSE-NEXT: addpd %xmm3, %xmm2 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movapd %xmm9, %xmm3 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movapd %xmm11, %xmm3 ; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movapd %xmm10, %xmm4 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movapd %xmm13, %xmm4 ; SSE-NEXT: mulpd %xmm1, %xmm4 ; SSE-NEXT: addpd %xmm3, %xmm4 -; SSE-NEXT: movapd %xmm14, %xmm3 +; SSE-NEXT: movapd %xmm8, %xmm3 +; SSE-NEXT: movapd %xmm8, %xmm14 ; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: movapd %xmm12, %xmm14 -; SSE-NEXT: movapd %xmm12, %xmm5 +; SSE-NEXT: movapd %xmm15, %xmm8 +; SSE-NEXT: movapd %xmm15, %xmm5 ; SSE-NEXT: mulpd %xmm1, %xmm5 ; SSE-NEXT: addpd %xmm3, %xmm5 -; SSE-NEXT: mulpd %xmm11, %xmm0 -; SSE-NEXT: mulpd %xmm15, %xmm1 +; SSE-NEXT: mulpd %xmm10, %xmm0 +; SSE-NEXT: mulpd %xmm9, %xmm1 +; SSE-NEXT: movapd %xmm9, %xmm10 ; SSE-NEXT: addpd %xmm0, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm6 -; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: movapd %xmm0, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm0[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm6, %xmm3 +; SSE-NEXT: mulpd %xmm7, %xmm3 ; SSE-NEXT: addpd %xmm1, %xmm3 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm6, %xmm1 +; SSE-NEXT: mulpd %xmm7, %xmm1 ; SSE-NEXT: addpd %xmm5, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd %xmm6, %xmm5 +; SSE-NEXT: mulpd %xmm7, %xmm5 ; SSE-NEXT: addpd %xmm4, %xmm5 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: addpd %xmm2, %xmm7 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulpd %xmm0, %xmm2 -; SSE-NEXT: addpd %xmm6, %xmm2 +; SSE-NEXT: addpd %xmm7, %xmm2 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: mulpd %xmm0, %xmm4 ; SSE-NEXT: addpd %xmm5, %xmm4 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: mulpd %xmm0, %xmm5 ; SSE-NEXT: addpd %xmm1, %xmm5 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm1, %xmm0 ; SSE-NEXT: addpd %xmm3, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm6 -; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: movapd %xmm1, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm6, %xmm3 +; SSE-NEXT: mulpd %xmm7, %xmm3 ; SSE-NEXT: addpd %xmm0, %xmm3 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm6, %xmm7 -; SSE-NEXT: addpd %xmm5, %xmm7 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulpd %xmm7, %xmm9 +; SSE-NEXT: addpd %xmm5, %xmm9 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd %xmm6, %xmm5 +; SSE-NEXT: mulpd %xmm7, %xmm5 ; SSE-NEXT: addpd %xmm4, %xmm5 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm0, %xmm7 +; SSE-NEXT: addpd %xmm2, %xmm7 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm1, %xmm0 -; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: addpd %xmm7, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: mulpd %xmm1, %xmm4 ; SSE-NEXT: addpd %xmm5, %xmm4 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: mulpd %xmm1, %xmm6 -; SSE-NEXT: addpd %xmm7, %xmm6 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm1, %xmm7 +; SSE-NEXT: addpd %xmm9, %xmm7 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: addpd %xmm3, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: movapd %xmm5, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15 +; SSE-NEXT: movapd %xmm15, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm15[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulpd %xmm3, %xmm2 ; SSE-NEXT: addpd %xmm1, %xmm2 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm3, %xmm1 -; SSE-NEXT: addpd %xmm6, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: mulpd %xmm3, %xmm6 -; SSE-NEXT: addpd %xmm4, %xmm6 +; SSE-NEXT: addpd %xmm7, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm3, %xmm7 +; SSE-NEXT: addpd %xmm4, %xmm7 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: addpd %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm5, %xmm0 +; SSE-NEXT: mulpd %xmm15, %xmm0 ; SSE-NEXT: addpd %xmm3, %xmm0 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm5, %xmm0 -; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: mulpd %xmm15, %xmm0 +; SSE-NEXT: addpd %xmm7, %xmm0 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm5, %xmm0 +; SSE-NEXT: mulpd %xmm15, %xmm0 ; SSE-NEXT: addpd %xmm1, %xmm0 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: addpd %xmm2, %xmm5 -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm15 +; SSE-NEXT: addpd %xmm2, %xmm15 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: mulpd %xmm0, %xmm8 +; SSE-NEXT: movapd %xmm12, %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd %xmm13, %xmm2 -; SSE-NEXT: movapd %xmm13, %xmm12 +; SSE-NEXT: movapd %xmm6, %xmm2 +; SSE-NEXT: movapd %xmm6, %xmm12 ; SSE-NEXT: mulpd %xmm1, %xmm2 -; SSE-NEXT: addpd %xmm8, %xmm2 -; SSE-NEXT: mulpd %xmm0, %xmm9 -; SSE-NEXT: movapd %xmm10, %xmm4 +; SSE-NEXT: addpd %xmm3, %xmm2 +; SSE-NEXT: mulpd %xmm0, %xmm11 +; SSE-NEXT: movapd %xmm13, %xmm6 +; SSE-NEXT: movapd %xmm13, %xmm4 ; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm9, %xmm4 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movapd %xmm5, %xmm3 -; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: movapd %xmm14, %xmm6 -; SSE-NEXT: mulpd %xmm1, %xmm6 -; SSE-NEXT: addpd %xmm3, %xmm6 -; SSE-NEXT: movapd %xmm11, %xmm8 -; SSE-NEXT: mulpd %xmm11, %xmm0 -; SSE-NEXT: movapd %xmm15, %xmm10 -; SSE-NEXT: mulpd %xmm15, %xmm1 +; SSE-NEXT: addpd %xmm11, %xmm4 +; SSE-NEXT: mulpd %xmm0, %xmm14 +; SSE-NEXT: movapd %xmm8, %xmm7 +; SSE-NEXT: mulpd %xmm1, %xmm7 +; SSE-NEXT: addpd %xmm14, %xmm7 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: mulpd %xmm8, %xmm0 +; SSE-NEXT: movapd %xmm10, %xmm5 +; SSE-NEXT: mulpd %xmm10, %xmm1 ; SSE-NEXT: addpd %xmm0, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: movapd %xmm0, %xmm9 +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm0[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm7, %xmm3 +; SSE-NEXT: mulpd %xmm9, %xmm3 ; SSE-NEXT: addpd %xmm1, %xmm3 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm7, %xmm1 -; SSE-NEXT: addpd %xmm6, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: mulpd %xmm7, %xmm6 -; SSE-NEXT: addpd %xmm4, %xmm6 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: addpd %xmm2, %xmm7 +; SSE-NEXT: mulpd %xmm9, %xmm1 +; SSE-NEXT: addpd %xmm7, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm9, %xmm7 +; SSE-NEXT: addpd %xmm4, %xmm7 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: addpd %xmm2, %xmm9 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulpd %xmm0, %xmm2 -; SSE-NEXT: addpd %xmm7, %xmm2 +; SSE-NEXT: addpd %xmm9, %xmm2 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: mulpd %xmm0, %xmm4 -; SSE-NEXT: addpd %xmm6, %xmm4 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: mulpd %xmm0, %xmm6 -; SSE-NEXT: addpd %xmm1, %xmm6 +; SSE-NEXT: addpd %xmm7, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm0, %xmm7 +; SSE-NEXT: addpd %xmm1, %xmm7 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: addpd %xmm3, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: movapd %xmm1, %xmm9 +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm1[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm7, %xmm3 +; SSE-NEXT: mulpd %xmm9, %xmm3 ; SSE-NEXT: addpd %xmm0, %xmm3 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: mulpd %xmm7, %xmm9 -; SSE-NEXT: addpd %xmm6, %xmm9 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: mulpd %xmm7, %xmm6 -; SSE-NEXT: addpd %xmm4, %xmm6 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: addpd %xmm2, %xmm7 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm9, %xmm10 +; SSE-NEXT: addpd %xmm7, %xmm10 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm9, %xmm7 +; SSE-NEXT: addpd %xmm4, %xmm7 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: addpd %xmm2, %xmm9 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm1, %xmm0 -; SSE-NEXT: addpd %xmm7, %xmm0 +; SSE-NEXT: addpd %xmm9, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulpd %xmm1, %xmm9 +; SSE-NEXT: addpd %xmm7, %xmm9 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: mulpd %xmm1, %xmm7 -; SSE-NEXT: addpd %xmm6, %xmm7 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: mulpd %xmm1, %xmm6 -; SSE-NEXT: addpd %xmm9, %xmm6 +; SSE-NEXT: addpd %xmm10, %xmm7 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: addpd %xmm3, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 @@ -3817,12 +3824,11 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: addpd %xmm1, %xmm2 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm3, %xmm1 -; SSE-NEXT: addpd %xmm6, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: mulpd %xmm3, %xmm6 -; SSE-NEXT: addpd %xmm7, %xmm6 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: mulpd %xmm4, %xmm3 +; SSE-NEXT: addpd %xmm7, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm3, %xmm7 +; SSE-NEXT: addpd %xmm9, %xmm7 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: addpd %xmm0, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 @@ -3831,7 +3837,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm11, %xmm0 -; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: addpd %xmm7, %xmm0 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm11, %xmm0 @@ -3846,239 +3852,245 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd %xmm13, %xmm3 ; SSE-NEXT: mulpd %xmm0, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm12, %xmm2 ; SSE-NEXT: mulpd %xmm1, %xmm2 ; SSE-NEXT: addpd %xmm3, %xmm2 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movapd %xmm14, %xmm3 ; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movapd %xmm15, %xmm6 -; SSE-NEXT: mulpd %xmm1, %xmm6 -; SSE-NEXT: addpd %xmm3, %xmm6 -; SSE-NEXT: mulpd %xmm0, %xmm5 -; SSE-NEXT: movapd %xmm14, %xmm4 -; SSE-NEXT: movapd %xmm14, %xmm7 +; SSE-NEXT: movapd %xmm6, %xmm7 ; SSE-NEXT: mulpd %xmm1, %xmm7 -; SSE-NEXT: addpd %xmm5, %xmm7 +; SSE-NEXT: addpd %xmm3, %xmm7 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movapd %xmm4, %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movapd %xmm6, %xmm9 +; SSE-NEXT: mulpd %xmm1, %xmm9 +; SSE-NEXT: addpd %xmm3, %xmm9 ; SSE-NEXT: mulpd %xmm8, %xmm0 -; SSE-NEXT: mulpd %xmm10, %xmm1 -; SSE-NEXT: movapd %xmm10, %xmm14 +; SSE-NEXT: mulpd %xmm5, %xmm1 ; SSE-NEXT: addpd %xmm0, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm9 -; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: movapd %xmm0, %xmm10 +; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm9, %xmm3 +; SSE-NEXT: mulpd %xmm10, %xmm3 ; SSE-NEXT: addpd %xmm1, %xmm3 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: mulpd %xmm9, %xmm10 -; SSE-NEXT: addpd %xmm7, %xmm10 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm9, %xmm7 -; SSE-NEXT: addpd %xmm6, %xmm7 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: addpd %xmm2, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: mulpd %xmm10, %xmm12 +; SSE-NEXT: addpd %xmm9, %xmm12 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulpd %xmm10, %xmm9 +; SSE-NEXT: addpd %xmm7, %xmm9 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: addpd %xmm2, %xmm10 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: addpd %xmm9, %xmm1 +; SSE-NEXT: addpd %xmm10, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm0, %xmm10 +; SSE-NEXT: addpd %xmm9, %xmm10 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: mulpd %xmm0, %xmm9 -; SSE-NEXT: addpd %xmm7, %xmm9 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm0, %xmm7 -; SSE-NEXT: addpd %xmm10, %xmm7 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: addpd %xmm12, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm2, %xmm0 ; SSE-NEXT: addpd %xmm3, %xmm0 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: movapd %xmm6, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: movapd %xmm7, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulpd %xmm3, %xmm2 ; SSE-NEXT: addpd %xmm0, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: mulpd %xmm3, %xmm10 -; SSE-NEXT: addpd %xmm7, %xmm10 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm3, %xmm7 -; SSE-NEXT: addpd %xmm9, %xmm7 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: mulpd %xmm3, %xmm12 +; SSE-NEXT: addpd %xmm9, %xmm12 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulpd %xmm3, %xmm9 +; SSE-NEXT: addpd %xmm10, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm0, %xmm3 ; SSE-NEXT: addpd %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm6, %xmm0 +; SSE-NEXT: mulpd %xmm7, %xmm0 ; SSE-NEXT: addpd %xmm3, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm7, %xmm10 +; SSE-NEXT: addpd %xmm9, %xmm10 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: mulpd %xmm6, %xmm9 -; SSE-NEXT: addpd %xmm7, %xmm9 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm6, %xmm7 -; SSE-NEXT: addpd %xmm10, %xmm7 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: mulpd %xmm7, %xmm9 +; SSE-NEXT: addpd %xmm12, %xmm9 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: addpd %xmm2, %xmm7 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: movapd %xmm8, %xmm2 ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm8[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm2, %xmm1 -; SSE-NEXT: addpd %xmm6, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: mulpd %xmm2, %xmm10 -; SSE-NEXT: addpd %xmm7, %xmm10 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: movapd %xmm5, %xmm6 -; SSE-NEXT: mulpd %xmm2, %xmm6 -; SSE-NEXT: addpd %xmm9, %xmm6 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: addpd %xmm7, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: mulpd %xmm2, %xmm12 +; SSE-NEXT: addpd %xmm9, %xmm12 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm2, %xmm7 +; SSE-NEXT: addpd %xmm10, %xmm7 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm3, %xmm2 ; SSE-NEXT: addpd %xmm0, %xmm2 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: movapd %xmm7, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm8, %xmm0 ; SSE-NEXT: addpd %xmm2, %xmm0 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm8, %xmm0 -; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: addpd %xmm7, %xmm0 ; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm8, %xmm0 -; SSE-NEXT: addpd %xmm10, %xmm0 -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulpd %xmm8, %xmm9 +; SSE-NEXT: addpd %xmm12, %xmm9 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm0, %xmm8 ; SSE-NEXT: addpd %xmm1, %xmm8 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: mulpd %xmm0, %xmm13 +; SSE-NEXT: movapd %xmm13, %xmm12 +; SSE-NEXT: mulpd %xmm0, %xmm12 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd %xmm12, %xmm3 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: mulpd %xmm1, %xmm3 -; SSE-NEXT: addpd %xmm13, %xmm3 +; SSE-NEXT: addpd %xmm12, %xmm3 +; SSE-NEXT: movapd %xmm14, %xmm12 +; SSE-NEXT: movapd %xmm14, %xmm5 +; SSE-NEXT: mulpd %xmm0, %xmm12 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movapd %xmm13, %xmm9 -; SSE-NEXT: mulpd %xmm0, %xmm9 -; SSE-NEXT: movapd %xmm15, %xmm10 -; SSE-NEXT: mulpd %xmm1, %xmm10 -; SSE-NEXT: addpd %xmm9, %xmm10 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movapd %xmm12, %xmm9 -; SSE-NEXT: mulpd %xmm0, %xmm9 -; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm9, %xmm4 +; SSE-NEXT: mulpd %xmm1, %xmm13 +; SSE-NEXT: addpd %xmm12, %xmm13 +; SSE-NEXT: mulpd %xmm0, %xmm4 +; SSE-NEXT: movapd %xmm6, %xmm14 +; SSE-NEXT: mulpd %xmm1, %xmm14 +; SSE-NEXT: addpd %xmm4, %xmm14 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: mulpd %xmm6, %xmm0 -; SSE-NEXT: mulpd %xmm14, %xmm1 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: mulpd %xmm10, %xmm1 ; SSE-NEXT: addpd %xmm0, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: movapd %xmm2, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: mulpd %xmm0, %xmm9 -; SSE-NEXT: addpd %xmm1, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: mulpd %xmm0, %xmm12 +; SSE-NEXT: addpd %xmm1, %xmm12 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: addpd %xmm4, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15 -; SSE-NEXT: mulpd %xmm0, %xmm15 -; SSE-NEXT: addpd %xmm10, %xmm15 +; SSE-NEXT: addpd %xmm14, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulpd %xmm0, %xmm14 +; SSE-NEXT: addpd %xmm13, %xmm14 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: addpd %xmm3, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: mulpd %xmm2, %xmm10 -; SSE-NEXT: addpd %xmm0, %xmm10 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: mulpd %xmm2, %xmm13 +; SSE-NEXT: addpd %xmm0, %xmm13 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm2, %xmm0 -; SSE-NEXT: addpd %xmm15, %xmm0 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15 -; SSE-NEXT: mulpd %xmm2, %xmm15 -; SSE-NEXT: addpd %xmm1, %xmm15 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: addpd %xmm9, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: movapd %xmm9, %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm9[0] +; SSE-NEXT: addpd %xmm14, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulpd %xmm2, %xmm14 +; SSE-NEXT: addpd %xmm1, %xmm14 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: addpd %xmm12, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: movapd %xmm12, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm12[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: mulpd %xmm1, %xmm3 ; SSE-NEXT: addpd %xmm2, %xmm3 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulpd %xmm1, %xmm2 -; SSE-NEXT: addpd %xmm15, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15 -; SSE-NEXT: mulpd %xmm1, %xmm15 -; SSE-NEXT: addpd %xmm0, %xmm15 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: addpd %xmm10, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1,1] +; SSE-NEXT: addpd %xmm14, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulpd %xmm1, %xmm14 +; SSE-NEXT: addpd %xmm0, %xmm14 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm0, %xmm1 +; SSE-NEXT: addpd %xmm13, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: mulpd %xmm9, %xmm4 +; SSE-NEXT: mulpd %xmm12, %xmm4 ; SSE-NEXT: addpd %xmm1, %xmm4 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: mulpd %xmm9, %xmm10 -; SSE-NEXT: addpd %xmm15, %xmm10 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15 -; SSE-NEXT: mulpd %xmm9, %xmm15 -; SSE-NEXT: addpd %xmm2, %xmm15 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: addpd %xmm3, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: mulpd %xmm12, %xmm13 +; SSE-NEXT: addpd %xmm14, %xmm13 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulpd %xmm12, %xmm14 +; SSE-NEXT: addpd %xmm2, %xmm14 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: addpd %xmm3, %xmm12 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: movapd %xmm2, %xmm3 ; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm3, %xmm1 -; SSE-NEXT: addpd %xmm9, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: mulpd %xmm3, %xmm9 -; SSE-NEXT: addpd %xmm15, %xmm9 -; SSE-NEXT: movapd %xmm5, %xmm0 +; SSE-NEXT: addpd %xmm12, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: mulpd %xmm3, %xmm12 +; SSE-NEXT: addpd %xmm14, %xmm12 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm3, %xmm0 -; SSE-NEXT: addpd %xmm10, %xmm0 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: addpd %xmm13, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm7, %xmm3 ; SSE-NEXT: addpd %xmm4, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: movapd %xmm7, %xmm15 -; SSE-NEXT: mulpd %xmm2, %xmm15 -; SSE-NEXT: addpd %xmm3, %xmm15 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: mulpd %xmm2, %xmm10 -; SSE-NEXT: addpd %xmm0, %xmm10 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulpd %xmm2, %xmm14 +; SSE-NEXT: addpd %xmm3, %xmm14 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: mulpd %xmm2, %xmm13 +; SSE-NEXT: addpd %xmm0, %xmm13 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: mulpd %xmm2, %xmm7 -; SSE-NEXT: addpd %xmm9, %xmm7 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: addpd %xmm12, %xmm7 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm0, %xmm2 ; SSE-NEXT: addpd %xmm1, %xmm2 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: mulpd %xmm0, %xmm9 -; SSE-NEXT: mulpd %xmm0, %xmm13 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: mulpd %xmm0, %xmm12 +; SSE-NEXT: mulpd %xmm0, %xmm5 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: mulpd %xmm0, %xmm3 ; SSE-NEXT: mulpd %xmm6, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm9, %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm9 +; SSE-NEXT: addpd %xmm12, %xmm4 +; SSE-NEXT: movapd %xmm4, %xmm12 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm13, %xmm4 +; SSE-NEXT: addpd %xmm5, %xmm4 ; SSE-NEXT: movapd %xmm4, %xmm5 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm12, %xmm4 +; SSE-NEXT: addpd %xmm3, %xmm4 ; SSE-NEXT: movapd %xmm4, %xmm3 -; SSE-NEXT: mulpd %xmm14, %xmm1 +; SSE-NEXT: mulpd %xmm10, %xmm1 ; SSE-NEXT: addpd %xmm0, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm4 ; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 -; SSE-NEXT: mulpd %xmm4, %xmm14 -; SSE-NEXT: addpd %xmm1, %xmm14 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm4, %xmm10 +; SSE-NEXT: addpd %xmm1, %xmm10 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm4, %xmm1 ; SSE-NEXT: addpd %xmm3, %xmm1 @@ -4088,12 +4100,12 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: addpd %xmm5, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm5 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: addpd %xmm9, %xmm4 +; SSE-NEXT: addpd %xmm12, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm0, %xmm1 ; SSE-NEXT: addpd %xmm4, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm9 +; SSE-NEXT: movapd %xmm1, %xmm12 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 ; SSE-NEXT: mulpd %xmm0, %xmm6 ; SSE-NEXT: addpd %xmm5, %xmm6 @@ -4102,7 +4114,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: addpd %xmm3, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm3 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: addpd %xmm14, %xmm0 +; SSE-NEXT: addpd %xmm10, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm4 ; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm1[0] @@ -4112,13 +4124,13 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm4, %xmm0 ; SSE-NEXT: addpd %xmm3, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm14 +; SSE-NEXT: movapd %xmm0, %xmm10 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm4, %xmm0 ; SSE-NEXT: addpd %xmm6, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm6 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: addpd %xmm9, %xmm4 +; SSE-NEXT: addpd %xmm12, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm1, %xmm0 @@ -4130,8 +4142,8 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd %xmm0, %xmm6 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm1, %xmm0 -; SSE-NEXT: addpd %xmm14, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm9 +; SSE-NEXT: addpd %xmm10, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm10 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: addpd %xmm5, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 @@ -4142,10 +4154,10 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: addpd %xmm1, %xmm5 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm4, %xmm1 -; SSE-NEXT: addpd %xmm9, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: mulpd %xmm4, %xmm9 -; SSE-NEXT: addpd %xmm6, %xmm9 +; SSE-NEXT: addpd %xmm10, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm4, %xmm10 +; SSE-NEXT: addpd %xmm6, %xmm10 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: addpd %xmm3, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] @@ -4154,7 +4166,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: addpd %xmm4, %xmm3 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: mulpd %xmm0, %xmm4 -; SSE-NEXT: addpd %xmm9, %xmm4 +; SSE-NEXT: addpd %xmm10, %xmm4 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 ; SSE-NEXT: mulpd %xmm0, %xmm6 ; SSE-NEXT: addpd %xmm1, %xmm6 @@ -4164,16 +4176,15 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd %xmm4, 480(%rdi) ; SSE-NEXT: movapd %xmm6, 464(%rdi) ; SSE-NEXT: movapd %xmm0, 448(%rdi) -; SSE-NEXT: movapd %xmm15, 432(%rdi) -; SSE-NEXT: movapd %xmm10, 416(%rdi) +; SSE-NEXT: movapd %xmm14, 432(%rdi) +; SSE-NEXT: movapd %xmm13, 416(%rdi) ; SSE-NEXT: movapd %xmm7, 400(%rdi) ; SSE-NEXT: movapd %xmm2, 384(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%rdi) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 352(%rdi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 336(%rdi) +; SSE-NEXT: movapd %xmm9, 336(%rdi) ; SSE-NEXT: movapd %xmm8, 320(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%rdi) @@ -4188,8 +4199,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movaps %xmm0, 224(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rdi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rdi) +; SSE-NEXT: movapd %xmm15, 192(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4214,7 +4224,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movaps %xmm0, 16(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdi) -; SSE-NEXT: addq $344, %rsp # imm = 0x158 +; SSE-NEXT: addq $328, %rsp # imm = 0x148 ; SSE-NEXT: retq ; ; AVX1-LABEL: test_mul8x8_f64: @@ -4223,370 +4233,371 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $448, %rsp # imm = 0x1C0 -; AVX1-NEXT: vmovapd %ymm4, %ymm13 -; AVX1-NEXT: vmovapd %ymm3, %ymm9 -; AVX1-NEXT: vmovapd %ymm1, %ymm4 +; AVX1-NEXT: vmovapd %ymm2, %ymm12 ; AVX1-NEXT: vmovapd %ymm0, (%rsp) # 32-byte Spill ; AVX1-NEXT: movq %rdi, %rax -; AVX1-NEXT: vmovapd 112(%rbp), %ymm12 -; AVX1-NEXT: vmovapd 48(%rbp), %ymm15 +; AVX1-NEXT: vmovapd 144(%rbp), %ymm2 +; AVX1-NEXT: vmovapd 112(%rbp), %ymm13 ; AVX1-NEXT: vbroadcastsd 272(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm8 -; AVX1-NEXT: vmulpd %ymm0, %ymm10, %ymm1 +; AVX1-NEXT: vmovapd %ymm1, %ymm9 +; AVX1-NEXT: vmulpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 280(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm11, %ymm8, %ymm3 -; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 -; AVX1-NEXT: vmovapd %ymm2, %ymm8 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm0 +; AVX1-NEXT: vaddpd %ymm11, %ymm8, %ymm1 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 288(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 296(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 304(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 312(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm11 -; AVX1-NEXT: vmovapd %ymm12, %ymm14 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX1-NEXT: vmovapd %ymm13, %ymm14 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 320(%rbp), %ymm10 -; AVX1-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm11 +; AVX1-NEXT: vmovapd %ymm2, %ymm13 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX1-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 328(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vbroadcastsd 336(%rbp), %ymm0 -; AVX1-NEXT: vmulpd %ymm0, %ymm4, %ymm3 +; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm1 ; AVX1-NEXT: vbroadcastsd 344(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm11 -; AVX1-NEXT: vmovapd %ymm9, %ymm12 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmovapd (%rsp), %ymm2 # 32-byte Reload -; AVX1-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm10 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm11 +; AVX1-NEXT: vmovapd %ymm3, %ymm8 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd (%rsp), %ymm15 # 32-byte Reload +; AVX1-NEXT: vmulpd %ymm0, %ymm15, %ymm0 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 352(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmovapd %ymm5, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 360(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 368(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 16(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vmovapd 16(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 376(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmovapd 80(%rbp), %ymm9 -; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd 80(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 384(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 144(%rbp), %ymm9 -; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm11 +; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmovapd 176(%rbp), %ymm9 -; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmovapd 176(%rbp), %ymm14 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 392(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 240(%rbp), %ymm9 -; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmovapd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vmovapd 240(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vbroadcastsd 400(%rbp), %ymm0 -; AVX1-NEXT: vmulpd %ymm0, %ymm4, %ymm3 -; AVX1-NEXT: vmovapd %ymm4, %ymm9 +; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm1 ; AVX1-NEXT: vbroadcastsd 408(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm10 +; AVX1-NEXT: vmovapd %ymm8, %ymm5 +; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm11 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmulpd %ymm0, %ymm15, %ymm0 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 416(%rbp), %ymm10 -; AVX1-NEXT: vmovapd %ymm13, %ymm4 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm10 -; AVX1-NEXT: vmovapd %ymm5, %ymm13 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm10 +; AVX1-NEXT: vmovapd %ymm3, %ymm2 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 424(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 -; AVX1-NEXT: vmovapd %ymm7, %ymm5 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 -; AVX1-NEXT: vmovapd %ymm6, %ymm7 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 432(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 440(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 448(%rbp), %ymm10 -; AVX1-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 +; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 456(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vbroadcastsd 464(%rbp), %ymm0 -; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm3 -; AVX1-NEXT: vmovapd %ymm9, %ymm6 +; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm1 +; AVX1-NEXT: vmovapd %ymm9, %ymm13 ; AVX1-NEXT: vbroadcastsd 472(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vmovapd %ymm2, %ymm1 -; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm10 -; AVX1-NEXT: vmovapd %ymm8, %ymm14 +; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm11 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmulpd %ymm0, %ymm15, %ymm0 +; AVX1-NEXT: vmovapd %ymm15, %ymm9 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 480(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX1-NEXT: vmovapd %ymm4, %ymm15 +; AVX1-NEXT: vmovapd %ymm4, %ymm3 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmovapd %ymm13, %ymm9 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmovapd %ymm2, %ymm15 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 488(%rbp), %ymm10 -; AVX1-NEXT: vmovapd %ymm5, %ymm8 -; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm10 +; AVX1-NEXT: vmovapd %ymm7, %ymm8 +; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd %ymm6, %ymm7 +; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 496(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX1-NEXT: vmovapd 48(%rbp), %ymm4 ; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 504(%rbp), %ymm10 ; AVX1-NEXT: vmovapd 112(%rbp), %ymm2 ; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd 80(%rbp), %ymm14 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 512(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 144(%rbp), %ymm13 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX1-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmovapd 176(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 520(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm2 -; AVX1-NEXT: vmovapd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovapd 208(%rbp), %ymm2 -; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vbroadcastsd 528(%rbp), %ymm0 -; AVX1-NEXT: vmovapd %ymm6, %ymm2 -; AVX1-NEXT: vmulpd %ymm0, %ymm6, %ymm3 +; AVX1-NEXT: vmulpd %ymm0, %ymm13, %ymm1 ; AVX1-NEXT: vbroadcastsd 536(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm11 +; AVX1-NEXT: vmovapd %ymm5, %ymm6 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm0 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 ; AVX1-NEXT: vmovapd %ymm12, %ymm5 -; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmulpd %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovapd %ymm1, %ymm12 -; AVX1-NEXT: vmovapd %ymm14, %ymm6 -; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 544(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm11 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm11 +; AVX1-NEXT: vmovapd %ymm3, %ymm12 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 552(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 560(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 16(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmovapd %ymm4, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 568(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmovapd 80(%rbp), %ymm4 -; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 576(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX1-NEXT: vmovapd 144(%rbp), %ymm4 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmovapd 176(%rbp), %ymm13 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 584(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 240(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmovapd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovapd 208(%rbp), %ymm14 -; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vmovapd 240(%rbp), %ymm14 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovapd 208(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vbroadcastsd 592(%rbp), %ymm0 -; AVX1-NEXT: vmulpd %ymm0, %ymm2, %ymm3 +; AVX1-NEXT: vmulpd %ymm0, %ymm13, %ymm1 ; AVX1-NEXT: vbroadcastsd 600(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmulpd %ymm0, %ymm12, %ymm0 -; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm0 +; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 608(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm11 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 616(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 624(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmovapd 48(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 632(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 112(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm10 +; AVX1-NEXT: vmovapd 112(%rbp), %ymm3 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd 80(%rbp), %ymm3 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 640(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 144(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmovapd 176(%rbp), %ymm3 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 648(%rbp), %ymm10 -; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd %ymm14, %ymm4 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vbroadcastsd 656(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm2, %ymm3 +; AVX1-NEXT: vmovapd %ymm13, %ymm3 +; AVX1-NEXT: vmulpd %ymm1, %ymm13, %ymm2 ; AVX1-NEXT: vbroadcastsd 664(%rbp), %ymm0 -; AVX1-NEXT: vmulpd %ymm0, %ymm5, %ymm13 -; AVX1-NEXT: vaddpd %ymm3, %ymm13, %ymm3 -; AVX1-NEXT: vmulpd %ymm1, %ymm12, %ymm1 -; AVX1-NEXT: vmulpd %ymm0, %ymm6, %ymm0 +; AVX1-NEXT: vmulpd %ymm0, %ymm6, %ymm14 +; AVX1-NEXT: vmovapd %ymm6, %ymm10 +; AVX1-NEXT: vaddpd %ymm2, %ymm14, %ymm2 +; AVX1-NEXT: vmulpd %ymm1, %ymm9, %ymm1 +; AVX1-NEXT: vmulpd %ymm0, %ymm5, %ymm0 +; AVX1-NEXT: vmovapd %ymm5, %ymm6 ; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: vbroadcastsd 672(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm15, %ymm13 -; AVX1-NEXT: vaddpd %ymm0, %ymm13, %ymm0 -; AVX1-NEXT: vmulpd %ymm1, %ymm9, %ymm1 -; AVX1-NEXT: vaddpd %ymm1, %ymm3, %ymm1 -; AVX1-NEXT: vbroadcastsd 680(%rbp), %ymm3 -; AVX1-NEXT: vmulpd %ymm3, %ymm8, %ymm13 -; AVX1-NEXT: vaddpd %ymm1, %ymm13, %ymm1 -; AVX1-NEXT: vmulpd %ymm3, %ymm7, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastsd 688(%rbp), %ymm3 -; AVX1-NEXT: vmovapd 16(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm13 -; AVX1-NEXT: vaddpd %ymm0, %ymm13, %ymm0 -; AVX1-NEXT: vmulpd 48(%rbp), %ymm3, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vbroadcastsd 696(%rbp), %ymm3 -; AVX1-NEXT: vmulpd 112(%rbp), %ymm3, %ymm13 -; AVX1-NEXT: vaddpd %ymm1, %ymm13, %ymm1 -; AVX1-NEXT: vmulpd 80(%rbp), %ymm3, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastsd 704(%rbp), %ymm3 -; AVX1-NEXT: vmulpd 144(%rbp), %ymm3, %ymm13 -; AVX1-NEXT: vaddpd %ymm0, %ymm13, %ymm0 -; AVX1-NEXT: vmulpd 176(%rbp), %ymm3, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vbroadcastsd 712(%rbp), %ymm13 -; AVX1-NEXT: vmovapd 240(%rbp), %ymm11 -; AVX1-NEXT: vmulpd %ymm13, %ymm11, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm3 -; AVX1-NEXT: vmulpd %ymm13, %ymm14, %ymm1 -; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastsd 720(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm2, %ymm4 -; AVX1-NEXT: vmulpd %ymm1, %ymm12, %ymm1 -; AVX1-NEXT: vbroadcastsd 728(%rbp), %ymm2 -; AVX1-NEXT: vmulpd %ymm2, %ymm5, %ymm5 -; AVX1-NEXT: vaddpd %ymm5, %ymm4, %ymm4 -; AVX1-NEXT: vmulpd %ymm2, %ymm6, %ymm2 +; AVX1-NEXT: vmulpd %ymm1, %ymm12, %ymm14 +; AVX1-NEXT: vaddpd %ymm0, %ymm14, %ymm0 +; AVX1-NEXT: vmulpd %ymm1, %ymm15, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vbroadcastsd 680(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm8, %ymm14 +; AVX1-NEXT: vaddpd %ymm1, %ymm14, %ymm1 +; AVX1-NEXT: vmulpd %ymm2, %ymm7, %ymm2 +; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd 688(%rbp), %ymm2 +; AVX1-NEXT: vmovapd 16(%rbp), %ymm11 +; AVX1-NEXT: vmulpd %ymm2, %ymm11, %ymm14 +; AVX1-NEXT: vaddpd %ymm0, %ymm14, %ymm0 +; AVX1-NEXT: vmulpd 48(%rbp), %ymm2, %ymm2 +; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd 696(%rbp), %ymm2 +; AVX1-NEXT: vmovapd 112(%rbp), %ymm5 +; AVX1-NEXT: vmulpd %ymm2, %ymm5, %ymm14 +; AVX1-NEXT: vaddpd %ymm1, %ymm14, %ymm1 +; AVX1-NEXT: vmovapd 80(%rbp), %ymm5 +; AVX1-NEXT: vmulpd %ymm2, %ymm5, %ymm2 +; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd 704(%rbp), %ymm2 +; AVX1-NEXT: vmulpd 144(%rbp), %ymm2, %ymm14 +; AVX1-NEXT: vaddpd %ymm0, %ymm14, %ymm0 +; AVX1-NEXT: vmovapd 176(%rbp), %ymm13 +; AVX1-NEXT: vmulpd %ymm2, %ymm13, %ymm2 ; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vbroadcastsd 736(%rbp), %ymm2 -; AVX1-NEXT: vmulpd %ymm2, %ymm15, %ymm5 -; AVX1-NEXT: vaddpd %ymm5, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd 712(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm4, %ymm14 +; AVX1-NEXT: vaddpd %ymm1, %ymm14, %ymm1 +; AVX1-NEXT: vmovapd 208(%rbp), %ymm14 +; AVX1-NEXT: vmulpd %ymm2, %ymm14, %ymm2 +; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd 720(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm3, %ymm3 ; AVX1-NEXT: vmulpd %ymm2, %ymm9, %ymm2 -; AVX1-NEXT: vaddpd %ymm2, %ymm4, %ymm2 +; AVX1-NEXT: vbroadcastsd 728(%rbp), %ymm4 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm3, %ymm3 +; AVX1-NEXT: vmulpd %ymm4, %ymm6, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastsd 736(%rbp), %ymm4 +; AVX1-NEXT: vmulpd %ymm4, %ymm12, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX1-NEXT: vmulpd %ymm4, %ymm15, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vbroadcastsd 744(%rbp), %ymm4 ; AVX1-NEXT: vmulpd %ymm4, %ymm8, %ymm5 -; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX1-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ; AVX1-NEXT: vmulpd %ymm4, %ymm7, %ymm4 -; AVX1-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vbroadcastsd 752(%rbp), %ymm4 -; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm5 -; AVX1-NEXT: vaddpd %ymm5, %ymm1, %ymm1 +; AVX1-NEXT: vmulpd %ymm4, %ymm11, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 ; AVX1-NEXT: vmulpd 48(%rbp), %ymm4, %ymm4 -; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vbroadcastsd 760(%rbp), %ymm4 ; AVX1-NEXT: vmulpd 112(%rbp), %ymm4, %ymm5 -; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX1-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ; AVX1-NEXT: vmulpd 80(%rbp), %ymm4, %ymm4 -; AVX1-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vbroadcastsd 768(%rbp), %ymm4 ; AVX1-NEXT: vmulpd 144(%rbp), %ymm4, %ymm5 -; AVX1-NEXT: vaddpd %ymm5, %ymm1, %ymm1 -; AVX1-NEXT: vmulpd 176(%rbp), %ymm4, %ymm4 -; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vbroadcastsd 776(%rbp), %ymm4 -; AVX1-NEXT: vmulpd %ymm4, %ymm11, %ymm5 ; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX1-NEXT: vmulpd %ymm4, %ymm13, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastsd 776(%rbp), %ymm4 +; AVX1-NEXT: vmulpd 240(%rbp), %ymm4, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ; AVX1-NEXT: vmulpd %ymm4, %ymm14, %ymm4 -; AVX1-NEXT: vaddpd %ymm4, %ymm1, %ymm1 -; AVX1-NEXT: vmovapd %ymm2, 480(%rdi) -; AVX1-NEXT: vmovapd %ymm1, 448(%rdi) -; AVX1-NEXT: vmovapd %ymm3, 416(%rdi) +; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vmovapd %ymm3, 480(%rdi) +; AVX1-NEXT: vmovapd %ymm2, 448(%rdi) +; AVX1-NEXT: vmovapd %ymm1, 416(%rdi) ; AVX1-NEXT: vmovapd %ymm0, 384(%rdi) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, 352(%rdi) @@ -4623,370 +4634,371 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX2-NEXT: movq %rsp, %rbp ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $448, %rsp # imm = 0x1C0 -; AVX2-NEXT: vmovapd %ymm4, %ymm13 -; AVX2-NEXT: vmovapd %ymm3, %ymm9 -; AVX2-NEXT: vmovapd %ymm1, %ymm4 +; AVX2-NEXT: vmovapd %ymm2, %ymm12 ; AVX2-NEXT: vmovapd %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: movq %rdi, %rax -; AVX2-NEXT: vmovapd 112(%rbp), %ymm12 -; AVX2-NEXT: vmovapd 48(%rbp), %ymm15 +; AVX2-NEXT: vmovapd 144(%rbp), %ymm2 +; AVX2-NEXT: vmovapd 112(%rbp), %ymm13 ; AVX2-NEXT: vbroadcastsd 272(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm8 -; AVX2-NEXT: vmulpd %ymm0, %ymm10, %ymm1 +; AVX2-NEXT: vmovapd %ymm1, %ymm9 +; AVX2-NEXT: vmulpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 280(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm11, %ymm8, %ymm3 -; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 -; AVX2-NEXT: vmovapd %ymm2, %ymm8 -; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm0 +; AVX2-NEXT: vaddpd %ymm11, %ymm8, %ymm1 +; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 +; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 288(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 296(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 304(%rbp), %ymm10 ; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm10 -; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 312(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm11 -; AVX2-NEXT: vmovapd %ymm12, %ymm14 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX2-NEXT: vmovapd %ymm13, %ymm14 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 320(%rbp), %ymm10 -; AVX2-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 +; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 +; AVX2-NEXT: vmovapd %ymm2, %ymm13 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX2-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 328(%rbp), %ymm10 ; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm1 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm3 -; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 +; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 336(%rbp), %ymm0 -; AVX2-NEXT: vmulpd %ymm0, %ymm4, %ymm3 +; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm1 ; AVX2-NEXT: vbroadcastsd 344(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm11 -; AVX2-NEXT: vmovapd %ymm9, %ymm12 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX2-NEXT: vmovapd (%rsp), %ymm2 # 32-byte Reload -; AVX2-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm10 +; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 +; AVX2-NEXT: vmovapd %ymm3, %ymm8 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX2-NEXT: vmovapd (%rsp), %ymm15 # 32-byte Reload +; AVX2-NEXT: vmulpd %ymm0, %ymm15, %ymm0 +; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 352(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vmovapd %ymm5, %ymm3 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 360(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 368(%rbp), %ymm10 -; AVX2-NEXT: vmovapd 16(%rbp), %ymm1 -; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX2-NEXT: vmovapd 16(%rbp), %ymm2 +; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm10 -; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 376(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX2-NEXT: vmovapd 80(%rbp), %ymm9 -; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm10 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX2-NEXT: vmovapd 80(%rbp), %ymm2 +; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 384(%rbp), %ymm10 -; AVX2-NEXT: vmovapd 144(%rbp), %ymm9 -; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm11 +; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmovapd 176(%rbp), %ymm9 -; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm10 -; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vmovapd 176(%rbp), %ymm14 +; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 392(%rbp), %ymm10 -; AVX2-NEXT: vmovapd 240(%rbp), %ymm9 -; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX2-NEXT: vmovapd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm3 -; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vmovapd 240(%rbp), %ymm2 +; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 +; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 400(%rbp), %ymm0 -; AVX2-NEXT: vmulpd %ymm0, %ymm4, %ymm3 -; AVX2-NEXT: vmovapd %ymm4, %ymm9 +; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm1 ; AVX2-NEXT: vbroadcastsd 408(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX2-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm10 +; AVX2-NEXT: vmovapd %ymm8, %ymm5 +; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm11 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX2-NEXT: vmulpd %ymm0, %ymm15, %ymm0 +; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 416(%rbp), %ymm10 -; AVX2-NEXT: vmovapd %ymm13, %ymm4 -; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm10 -; AVX2-NEXT: vmovapd %ymm5, %ymm13 -; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 +; AVX2-NEXT: vmovapd %ymm3, %ymm2 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 424(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 -; AVX2-NEXT: vmovapd %ymm7, %ymm5 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 -; AVX2-NEXT: vmovapd %ymm6, %ymm7 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 432(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm10 -; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 440(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 448(%rbp), %ymm10 -; AVX2-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 +; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 456(%rbp), %ymm10 ; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm1 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm3 -; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 +; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 464(%rbp), %ymm0 -; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm3 -; AVX2-NEXT: vmovapd %ymm9, %ymm6 +; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm1 +; AVX2-NEXT: vmovapd %ymm9, %ymm13 ; AVX2-NEXT: vbroadcastsd 472(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX2-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vmovapd %ymm2, %ymm1 -; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm10 -; AVX2-NEXT: vmovapd %ymm8, %ymm14 +; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm11 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX2-NEXT: vmulpd %ymm0, %ymm15, %ymm0 +; AVX2-NEXT: vmovapd %ymm15, %ymm9 +; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 480(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX2-NEXT: vmovapd %ymm4, %ymm15 +; AVX2-NEXT: vmovapd %ymm4, %ymm3 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmovapd %ymm13, %ymm9 -; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm10 -; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vmovapd %ymm2, %ymm15 +; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 488(%rbp), %ymm10 -; AVX2-NEXT: vmovapd %ymm5, %ymm8 -; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm10 +; AVX2-NEXT: vmovapd %ymm7, %ymm8 +; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX2-NEXT: vmovapd %ymm6, %ymm7 +; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 496(%rbp), %ymm10 ; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX2-NEXT: vmovapd 48(%rbp), %ymm4 ; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 504(%rbp), %ymm10 ; AVX2-NEXT: vmovapd 112(%rbp), %ymm2 ; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX2-NEXT: vmovapd 80(%rbp), %ymm14 +; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 512(%rbp), %ymm10 -; AVX2-NEXT: vmovapd 144(%rbp), %ymm13 -; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX2-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vmovapd 176(%rbp), %ymm2 +; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 520(%rbp), %ymm10 ; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm2 -; AVX2-NEXT: vmovapd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovapd 208(%rbp), %ymm2 -; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm3 -; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 +; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 528(%rbp), %ymm0 -; AVX2-NEXT: vmovapd %ymm6, %ymm2 -; AVX2-NEXT: vmulpd %ymm0, %ymm6, %ymm3 +; AVX2-NEXT: vmulpd %ymm0, %ymm13, %ymm1 ; AVX2-NEXT: vbroadcastsd 536(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm11 +; AVX2-NEXT: vmovapd %ymm5, %ymm6 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm0 +; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10 ; AVX2-NEXT: vmovapd %ymm12, %ymm5 -; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX2-NEXT: vmulpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovapd %ymm1, %ymm12 -; AVX2-NEXT: vmovapd %ymm14, %ymm6 -; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 544(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm11 +; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 +; AVX2-NEXT: vmovapd %ymm3, %ymm12 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm10 -; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm10 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 552(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 560(%rbp), %ymm10 -; AVX2-NEXT: vmovapd 16(%rbp), %ymm1 -; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vmovapd %ymm4, %ymm3 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 568(%rbp), %ymm10 ; AVX2-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX2-NEXT: vmovapd 80(%rbp), %ymm4 -; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 576(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX2-NEXT: vmovapd 144(%rbp), %ymm4 +; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmovapd 176(%rbp), %ymm13 -; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm10 -; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 584(%rbp), %ymm10 -; AVX2-NEXT: vmovapd 240(%rbp), %ymm1 -; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX2-NEXT: vmovapd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovapd 208(%rbp), %ymm14 -; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm3 -; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vmovapd 240(%rbp), %ymm14 +; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovapd 208(%rbp), %ymm2 +; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm1 +; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 592(%rbp), %ymm0 -; AVX2-NEXT: vmulpd %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vmulpd %ymm0, %ymm13, %ymm1 ; AVX2-NEXT: vbroadcastsd 600(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX2-NEXT: vmulpd %ymm0, %ymm12, %ymm0 -; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm0 +; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 608(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm11 +; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm11 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm10 -; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm10 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 616(%rbp), %ymm10 ; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 624(%rbp), %ymm10 ; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmovapd 48(%rbp), %ymm1 -; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm10 -; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 632(%rbp), %ymm10 -; AVX2-NEXT: vmovapd 112(%rbp), %ymm1 -; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm10 +; AVX2-NEXT: vmovapd 112(%rbp), %ymm3 +; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX2-NEXT: vmovapd 80(%rbp), %ymm3 +; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vbroadcastsd 640(%rbp), %ymm10 -; AVX2-NEXT: vmovapd 144(%rbp), %ymm1 -; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm10 -; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vmovapd 176(%rbp), %ymm3 +; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX2-NEXT: vbroadcastsd 648(%rbp), %ymm10 -; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 -; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm1 +; AVX2-NEXT: vmovapd %ymm14, %ymm4 +; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm3 -; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm1 +; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vbroadcastsd 656(%rbp), %ymm1 -; AVX2-NEXT: vmulpd %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vmovapd %ymm13, %ymm3 +; AVX2-NEXT: vmulpd %ymm1, %ymm13, %ymm2 ; AVX2-NEXT: vbroadcastsd 664(%rbp), %ymm0 -; AVX2-NEXT: vmulpd %ymm0, %ymm5, %ymm13 -; AVX2-NEXT: vaddpd %ymm3, %ymm13, %ymm3 -; AVX2-NEXT: vmulpd %ymm1, %ymm12, %ymm1 -; AVX2-NEXT: vmulpd %ymm0, %ymm6, %ymm0 +; AVX2-NEXT: vmulpd %ymm0, %ymm6, %ymm14 +; AVX2-NEXT: vmovapd %ymm6, %ymm10 +; AVX2-NEXT: vaddpd %ymm2, %ymm14, %ymm2 +; AVX2-NEXT: vmulpd %ymm1, %ymm9, %ymm1 +; AVX2-NEXT: vmulpd %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vmovapd %ymm5, %ymm6 ; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vbroadcastsd 672(%rbp), %ymm1 -; AVX2-NEXT: vmulpd %ymm1, %ymm15, %ymm13 -; AVX2-NEXT: vaddpd %ymm0, %ymm13, %ymm0 -; AVX2-NEXT: vmulpd %ymm1, %ymm9, %ymm1 -; AVX2-NEXT: vaddpd %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vbroadcastsd 680(%rbp), %ymm3 -; AVX2-NEXT: vmulpd %ymm3, %ymm8, %ymm13 -; AVX2-NEXT: vaddpd %ymm1, %ymm13, %ymm1 -; AVX2-NEXT: vmulpd %ymm3, %ymm7, %ymm3 -; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastsd 688(%rbp), %ymm3 -; AVX2-NEXT: vmovapd 16(%rbp), %ymm10 -; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm13 -; AVX2-NEXT: vaddpd %ymm0, %ymm13, %ymm0 -; AVX2-NEXT: vmulpd 48(%rbp), %ymm3, %ymm3 -; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vbroadcastsd 696(%rbp), %ymm3 -; AVX2-NEXT: vmulpd 112(%rbp), %ymm3, %ymm13 -; AVX2-NEXT: vaddpd %ymm1, %ymm13, %ymm1 -; AVX2-NEXT: vmulpd 80(%rbp), %ymm3, %ymm3 -; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastsd 704(%rbp), %ymm3 -; AVX2-NEXT: vmulpd 144(%rbp), %ymm3, %ymm13 -; AVX2-NEXT: vaddpd %ymm0, %ymm13, %ymm0 -; AVX2-NEXT: vmulpd 176(%rbp), %ymm3, %ymm3 -; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vbroadcastsd 712(%rbp), %ymm13 -; AVX2-NEXT: vmovapd 240(%rbp), %ymm11 -; AVX2-NEXT: vmulpd %ymm13, %ymm11, %ymm3 -; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vmulpd %ymm13, %ymm14, %ymm1 -; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastsd 720(%rbp), %ymm1 -; AVX2-NEXT: vmulpd %ymm1, %ymm2, %ymm4 -; AVX2-NEXT: vmulpd %ymm1, %ymm12, %ymm1 -; AVX2-NEXT: vbroadcastsd 728(%rbp), %ymm2 -; AVX2-NEXT: vmulpd %ymm2, %ymm5, %ymm5 -; AVX2-NEXT: vaddpd %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vmulpd %ymm2, %ymm6, %ymm2 +; AVX2-NEXT: vmulpd %ymm1, %ymm12, %ymm14 +; AVX2-NEXT: vaddpd %ymm0, %ymm14, %ymm0 +; AVX2-NEXT: vmulpd %ymm1, %ymm15, %ymm1 +; AVX2-NEXT: vaddpd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vbroadcastsd 680(%rbp), %ymm2 +; AVX2-NEXT: vmulpd %ymm2, %ymm8, %ymm14 +; AVX2-NEXT: vaddpd %ymm1, %ymm14, %ymm1 +; AVX2-NEXT: vmulpd %ymm2, %ymm7, %ymm2 +; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd 688(%rbp), %ymm2 +; AVX2-NEXT: vmovapd 16(%rbp), %ymm11 +; AVX2-NEXT: vmulpd %ymm2, %ymm11, %ymm14 +; AVX2-NEXT: vaddpd %ymm0, %ymm14, %ymm0 +; AVX2-NEXT: vmulpd 48(%rbp), %ymm2, %ymm2 ; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vbroadcastsd 736(%rbp), %ymm2 -; AVX2-NEXT: vmulpd %ymm2, %ymm15, %ymm5 -; AVX2-NEXT: vaddpd %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastsd 696(%rbp), %ymm2 +; AVX2-NEXT: vmovapd 112(%rbp), %ymm5 +; AVX2-NEXT: vmulpd %ymm2, %ymm5, %ymm14 +; AVX2-NEXT: vaddpd %ymm1, %ymm14, %ymm1 +; AVX2-NEXT: vmovapd 80(%rbp), %ymm5 +; AVX2-NEXT: vmulpd %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd 704(%rbp), %ymm2 +; AVX2-NEXT: vmulpd 144(%rbp), %ymm2, %ymm14 +; AVX2-NEXT: vaddpd %ymm0, %ymm14, %ymm0 +; AVX2-NEXT: vmovapd 176(%rbp), %ymm13 +; AVX2-NEXT: vmulpd %ymm2, %ymm13, %ymm2 +; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastsd 712(%rbp), %ymm2 +; AVX2-NEXT: vmulpd %ymm2, %ymm4, %ymm14 +; AVX2-NEXT: vaddpd %ymm1, %ymm14, %ymm1 +; AVX2-NEXT: vmovapd 208(%rbp), %ymm14 +; AVX2-NEXT: vmulpd %ymm2, %ymm14, %ymm2 +; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd 720(%rbp), %ymm2 +; AVX2-NEXT: vmulpd %ymm2, %ymm3, %ymm3 ; AVX2-NEXT: vmulpd %ymm2, %ymm9, %ymm2 -; AVX2-NEXT: vaddpd %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vbroadcastsd 728(%rbp), %ymm4 +; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vmulpd %ymm4, %ymm6, %ymm4 +; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vbroadcastsd 736(%rbp), %ymm4 +; AVX2-NEXT: vmulpd %ymm4, %ymm12, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vmulpd %ymm4, %ymm15, %ymm4 +; AVX2-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vbroadcastsd 744(%rbp), %ymm4 ; AVX2-NEXT: vmulpd %ymm4, %ymm8, %ymm5 -; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ; AVX2-NEXT: vmulpd %ymm4, %ymm7, %ymm4 -; AVX2-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vbroadcastsd 752(%rbp), %ymm4 -; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm5 -; AVX2-NEXT: vaddpd %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vmulpd %ymm4, %ymm11, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2 ; AVX2-NEXT: vmulpd 48(%rbp), %ymm4, %ymm4 -; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vbroadcastsd 760(%rbp), %ymm4 ; AVX2-NEXT: vmulpd 112(%rbp), %ymm4, %ymm5 -; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ; AVX2-NEXT: vmulpd 80(%rbp), %ymm4, %ymm4 -; AVX2-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vbroadcastsd 768(%rbp), %ymm4 ; AVX2-NEXT: vmulpd 144(%rbp), %ymm4, %ymm5 -; AVX2-NEXT: vaddpd %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vmulpd 176(%rbp), %ymm4, %ymm4 -; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vbroadcastsd 776(%rbp), %ymm4 -; AVX2-NEXT: vmulpd %ymm4, %ymm11, %ymm5 ; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vmulpd %ymm4, %ymm13, %ymm4 +; AVX2-NEXT: vaddpd %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vbroadcastsd 776(%rbp), %ymm4 +; AVX2-NEXT: vmulpd 240(%rbp), %ymm4, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ; AVX2-NEXT: vmulpd %ymm4, %ymm14, %ymm4 -; AVX2-NEXT: vaddpd %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vmovapd %ymm2, 480(%rdi) -; AVX2-NEXT: vmovapd %ymm1, 448(%rdi) -; AVX2-NEXT: vmovapd %ymm3, 416(%rdi) +; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vmovapd %ymm3, 480(%rdi) +; AVX2-NEXT: vmovapd %ymm2, 448(%rdi) +; AVX2-NEXT: vmovapd %ymm1, 416(%rdi) ; AVX2-NEXT: vmovapd %ymm0, 384(%rdi) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 352(%rdi) diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll index 31c090a5a2b1d..acf4d900745d3 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -2639,28 +2639,28 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; ; SSE41-LABEL: vec128_i8_unsigned_reg_reg: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pminub %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pcmpeqb %xmm2, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; SSE41-NEXT: pminub %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqb %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm2 +; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: pmaxub %xmm0, %xmm1 -; SSE41-NEXT: psubb %xmm2, %xmm1 +; SSE41-NEXT: psubb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm1, %xmm4 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm1, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm1, %xmm4 -; SSE41-NEXT: pmullw %xmm3, %xmm2 ; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: packuswb %xmm4, %xmm2 -; SSE41-NEXT: paddb %xmm2, %xmm0 +; SSE41-NEXT: pmullw %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm1, %xmm3 +; SSE41-NEXT: packuswb %xmm2, %xmm3 +; SSE41-NEXT: paddb %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: vec128_i8_unsigned_reg_reg: @@ -3115,26 +3115,26 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; ; SSE41-LABEL: vec128_i8_signed_reg_mem: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: movdqa (%rdi), %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pcmpgtb %xmm2, %xmm1 +; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pminsb %xmm1, %xmm3 -; SSE41-NEXT: pmaxsb %xmm0, %xmm1 -; SSE41-NEXT: psubb %xmm3, %xmm1 -; SSE41-NEXT: psrlw $1, %xmm1 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: pminsb %xmm2, %xmm3 +; SSE41-NEXT: pmaxsb %xmm0, %xmm2 +; SSE41-NEXT: psubb %xmm3, %xmm2 +; SSE41-NEXT: psrlw $1, %xmm2 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm2, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm2, %xmm1 ; SSE41-NEXT: pmullw %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 -; SSE41-NEXT: packuswb %xmm2, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: packuswb %xmm1, %xmm3 ; SSE41-NEXT: paddb %xmm3, %xmm0 ; SSE41-NEXT: retq ; @@ -3354,26 +3354,26 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; SSE41-LABEL: vec128_i8_signed_mem_mem: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm1 -; SSE41-NEXT: movdqa (%rsi), %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pcmpgtb %xmm2, %xmm3 -; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: movdqa (%rsi), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pcmpgtb %xmm3, %xmm2 +; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pminsb %xmm2, %xmm0 -; SSE41-NEXT: pmaxsb %xmm1, %xmm2 -; SSE41-NEXT: psubb %xmm0, %xmm2 -; SSE41-NEXT: psrlw $1, %xmm2 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: pminsb %xmm3, %xmm0 +; SSE41-NEXT: pmaxsb %xmm1, %xmm3 +; SSE41-NEXT: psubb %xmm0, %xmm3 +; SSE41-NEXT: psrlw $1, %xmm3 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm2, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm3, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm3, %xmm2 ; SSE41-NEXT: pmullw %xmm4, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: packuswb %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: packuswb %xmm2, %xmm0 ; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll index a6116175226ea..27a9acf181ea2 100644 --- a/llvm/test/CodeGen/X86/mmx-arith.ll +++ b/llvm/test/CodeGen/X86/mmx-arith.ll @@ -144,28 +144,28 @@ define void @test1(ptr %A, ptr %B) { ; X32: # %bb.0: # %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: paddd %xmm0, %xmm1 -; X32-NEXT: movq %xmm1, (%eax) ; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; X32-NEXT: pmuludq %xmm0, %xmm1 -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X32-NEXT: pmuludq %xmm0, %xmm2 -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X32-NEXT: movq %xmm1, (%eax) -; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: pand %xmm1, %xmm0 +; X32-NEXT: paddd %xmm1, %xmm0 ; X32-NEXT: movq %xmm0, (%eax) ; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: por %xmm0, %xmm1 +; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X32-NEXT: pmuludq %xmm1, %xmm0 +; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; X32-NEXT: pmuludq %xmm1, %xmm2 +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-NEXT: movq %xmm0, (%eax) +; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: pand %xmm0, %xmm1 ; X32-NEXT: movq %xmm1, (%eax) ; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: pxor %xmm1, %xmm0 +; X32-NEXT: por %xmm1, %xmm0 ; X32-NEXT: movq %xmm0, (%eax) +; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: pxor %xmm0, %xmm1 +; X32-NEXT: movq %xmm1, (%eax) ; X32-NEXT: emms ; X32-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll index c6801b5757ea1..6829356bf107e 100644 --- a/llvm/test/CodeGen/X86/mul-i1024.ll +++ b/llvm/test/CodeGen/X86/mul-i1024.ll @@ -10,397 +10,396 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi ; X32-NEXT: subl $400, %esp # imm = 0x190 +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 60(%ecx), %esi +; X32-NEXT: movl 60(%eax), %ebp +; X32-NEXT: movl 56(%eax), %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl (%edx), %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl 56(%ecx), %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%eax), %ebp -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 4(%eax), %ecx -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, %edi -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl 4(%ebx), %ebx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 48(%edi), %esi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 48(%ecx), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 52(%edi), %eax +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 52(%ecx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: adcl %ecx, %ebp ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X32-NEXT: mull %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 8(%eax), %ebp -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 8(%eax), %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 12(%eax), %edi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: adcl %ebp, %ebx -; X32-NEXT: setb (%esp) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edi, %ebp +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 12(%eax), %ecx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi -; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb (%esp) # 1-byte Folded Spill +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 40(%esi), %edi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 40(%ecx), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 44(%esi), %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 44(%ecx), %ebp ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: movl 32(%ebp), %edi -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 32(%ecx), %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 36(%ebp), %eax +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 36(%ecx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, %ebp ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %edi, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %edi -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebp ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %edi, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %esi, %ebx -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: adcl %ebp, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %bl, %ecx -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: mull %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl %ecx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %edi, %ecx -; X32-NEXT: movl (%esp), %esi # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movl %edi, %edx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload ; X32-NEXT: adcl %edi, %eax -; X32-NEXT: adcl $0, %edx +; X32-NEXT: adcl $0, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, (%esp) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 16(%eax), %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl 16(%eax), %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: adcl $0, %edi +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 20(%eax), %edx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %edx, %esi -; X32-NEXT: mull %edx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl 20(%eax), %ecx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ebp +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb %bl -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %esi, %edi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %esi, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %ecx +; X32-NEXT: adcl %esi, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebx, %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 24(%eax), %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 24(%eax), %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 28(%eax), %ecx -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl 28(%eax), %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebp, %esi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill @@ -412,180 +411,180 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %ebx, %edi -; X32-NEXT: setb %bl +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %edi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movzbl %bl, %edi -; X32-NEXT: adcl %edi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: adcl $0, %edx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X32-NEXT: addl %ebp, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %edi, %edx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl %edx, %eax ; X32-NEXT: adcl $0, %eax +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb (%esp) # 1-byte Folded Spill +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %edi -; X32-NEXT: setb %bl -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %edi, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: adcl %ebx, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edi, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebx, %ecx -; X32-NEXT: mull %ebx +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %edi, %ecx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X32-NEXT: adcl %esi, %eax +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -595,388 +594,390 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 24(%esi), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 24(%ecx), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl 28(%ecx), %ebp ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 28(%esi), %edi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, %ebp +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 16(%edi), %esi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 16(%ecx), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 20(%edi), %eax +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 20(%ecx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %ebx ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %ebp -; X32-NEXT: setb %bl +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %esi, %ebx -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: adcl %ebp, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edi, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 8(%esi), %ecx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, (%esp) # 4-byte Spill -; X32-NEXT: movl 12(%esi), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 8(%ecx), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 12(%ecx), %ebp ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: movl (%ebp), %edi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%ecx), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl 4(%ecx), %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 4(%ebp), %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %edi, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %edi -; X32-NEXT: setb %bl -; X32-NEXT: movl (%esp), %ebp # 4-byte Reload +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %edi, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %esi, %ebx -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl (%esp), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %esi ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %bl, %ecx -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: adcl %ecx, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %edi, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload -; X32-NEXT: adcl %edi, %eax -; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movl %ebx, %edx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %eax +; X32-NEXT: adcl $0, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebp, %edi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ebp -; X32-NEXT: setb %bl +; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%esp), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %esi, %eax +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: mull %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl (%esp), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl (%esp), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebp, %esi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill @@ -988,179 +989,182 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %ebx, %edi -; X32-NEXT: setb %bl +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %edi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movzbl %bl, %edi -; X32-NEXT: adcl %edi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: adcl $0, %edx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X32-NEXT: addl %ebp, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %edi, %edx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl %edx, %eax ; X32-NEXT: adcl $0, %eax +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl (%esp), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %ebx +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %edi -; X32-NEXT: setb %bl -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %edi, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: adcl %ebx, %ebp -; X32-NEXT: setb %bl +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: setb (%esp) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi +; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %esi ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: mull %ebp +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: addl %edi, %ebx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %esi, %ebx +; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebp ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -1174,11 +1178,11 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %ebx ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: adcl $0, %eax ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl $0, %eax ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -1190,12 +1194,12 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -1204,198 +1208,199 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl 32(%ebx), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl 32(%edi), %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 36(%eax), %ecx -; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, %ebx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %esi, %ebx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 36(%eax), %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %edi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb %cl +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 40(%eax), %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 40(%eax), %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 44(%eax), %ebx +; X32-NEXT: movl 44(%eax), %ecx ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: adcl %edi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl %ebx, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebx, %ecx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: adcl %edi, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %esi, %ebx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%esp), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebx, %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -1406,78 +1411,78 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %edi ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi +; X32-NEXT: addl %esi, %ebx +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %bl, %ecx -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl %ecx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X32-NEXT: adcl %ebx, %eax -; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl %edi, %ecx +; X32-NEXT: movl %ebx, %edx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload +; X32-NEXT: adcl %edi, %eax +; X32-NEXT: adcl $0, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -1490,58 +1495,59 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl %ebp, %edi ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 52(%eax), %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 52(%eax), %ebp ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edx +; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %ebx +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb %bl +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ebx ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%esp), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %esi -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -1549,74 +1555,74 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 56(%eax), %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 60(%eax), %ecx ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl (%esp), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl 60(%esi), %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload ; X32-NEXT: adcl %esi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: addl %ebx, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: addl %ebx, %esi ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %eax ; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -1625,164 +1631,166 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %edi, %edx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: movl %esi, %ecx ; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %eax -; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %esi, %ebx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %edi, %ebp +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ebx, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %edi, %ecx ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: addl %ebp, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: addl %edi, %ebx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: movl %ecx, %edi -; X32-NEXT: adcl %eax, %edi +; X32-NEXT: movl %ecx, %ebp +; X32-NEXT: adcl %eax, %ebp ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ebx -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edi +; X32-NEXT: adcl %eax, %edi ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -1795,27 +1803,27 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl (%esp), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl $0, %eax -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: adcl $0, %edx ; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload @@ -1824,370 +1832,374 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp -; X32-NEXT: adcl $0, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebp, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb %cl ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edi, %esi +; X32-NEXT: mull %ebx ; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: setb %bl +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: setb (%esp) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebx, %ecx -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %edi, %ecx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: adcl %esi, %ecx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, %esi +; X32-NEXT: mull %ebp ; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: adcl %ecx, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ebx -; X32-NEXT: mull %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx +; X32-NEXT: addl %edi, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %ebp -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebp, %esi -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %esi -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %edi +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movzbl %bl, %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: movl (%esp), %edx # 4-byte Reload +; X32-NEXT: addl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movl %ebp, %esi -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X32-NEXT: adcl %ebx, %eax -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload +; X32-NEXT: adcl %edi, %eax +; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ebp -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %esi -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: addl %esi, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload @@ -2211,170 +2223,175 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %edi, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X32-NEXT: adcl %esi, %edx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %edi, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ecx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl %edi, %eax ; X32-NEXT: adcl $0, %eax ; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, %esi +; X32-NEXT: mull %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %ebx, %edi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %edi, %ebx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: adcl %esi, %edi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax +; X32-NEXT: addl %ebp, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movl %esi, %ebp +; X32-NEXT: adcl %eax, %ebp ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -2386,14 +2403,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ebp -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: adcl %eax, %ebx ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: adcl $0, %ebx +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -2420,286 +2437,289 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 64(%eax), %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 64(%eax), %ecx +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 68(%eax), %ecx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 68(%eax), %edi ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %ebp +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebp, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb %cl +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 72(%eax), %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebp, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %esi, %ebp ; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 76(%eax), %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: mull %edx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %ebx, %esi ; X32-NEXT: setb %bl -; X32-NEXT: movl %ecx, %esi ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %esi, %ebp ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 72(%eax), %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 76(%eax), %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %esi, %ebp -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebx, %esi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %esi +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %edi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebp, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb %cl ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ebp -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: mull %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: adcl %ebx, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebx, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %esi, %ebx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %bl, %ecx -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl %ecx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: movl %edi, %edx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload ; X32-NEXT: adcl %edi, %eax -; X32-NEXT: adcl $0, %edx +; X32-NEXT: adcl $0, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -2707,13 +2727,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 80(%eax), %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx @@ -2721,274 +2741,276 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 84(%eax), %ecx -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ecx, %ebp -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb %bl -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edi +; X32-NEXT: adcl %esi, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebp, %edi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 88(%eax), %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi +; X32-NEXT: movl 88(%eax), %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 92(%eax), %edi ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %ebp, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 92(%eax), %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %edi, %ecx -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: adcl $0, %edx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X32-NEXT: addl %ebp, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %edi, %edx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl %edx, %eax ; X32-NEXT: adcl $0, %eax +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ebx -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %esi +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %ebx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %esi, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X32-NEXT: adcl %esi, %eax +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -3001,152 +3023,152 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl (%esp), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %ebp -; X32-NEXT: setb %bl -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %esi, %edi -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi +; X32-NEXT: addl %esi, %edi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %ebp -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %edi ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl %ebp, %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %esi, %ecx -; X32-NEXT: imull %eax, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: imull %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %ecx, %edx +; X32-NEXT: addl %esi, %edx +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: imull %ebp, %esi -; X32-NEXT: addl %edx, %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: imull %ebp, %eax +; X32-NEXT: addl %edx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload @@ -3163,27 +3185,26 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: movl %ebp, %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: adcl %ecx, %ebx ; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movzbl %cl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload @@ -3191,7 +3212,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl (%esp), %eax # 4-byte Reload ; X32-NEXT: imull %eax, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp @@ -3215,242 +3236,244 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl %ebp, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ecx, %esi +; X32-NEXT: adcl %ebx, %ebp ; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movzbl %cl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, (%esp) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 104(%esi), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 104(%ecx), %ebx ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 108(%esi), %esi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl 108(%ecx), %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ebx -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 96(%esi), %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 100(%esi), %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl 96(%esi), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 100(%esi), %ebp ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %esi, %ebx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebp, %esi +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %edi, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %esi -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %eax, %ebx +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebx, %ecx -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %edi, %ebx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 112(%ecx), %edi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: imull %edi, %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl 112(%esi), %edi +; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: imull %edi, %ecx ; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %esi, %edx -; X32-NEXT: movl 116(%ecx), %eax +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movl 116(%esi), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: imull %eax, %ebx ; X32-NEXT: addl %edx, %ebx -; X32-NEXT: movl 120(%ecx), %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %ecx, %esi +; X32-NEXT: movl 120(%esi), %eax +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: imull %esi, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: addl %esi, %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 124(%esi), %esi -; X32-NEXT: imull %ebp, %esi -; X32-NEXT: addl %edx, %esi +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 124(%ecx), %ecx +; X32-NEXT: imull %ebp, %ecx +; X32-NEXT: addl %edx, %ecx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: adcl %ebx, %ecx ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebp -; X32-NEXT: setb %bl +; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %ebx ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl %bl, %ecx -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edx +; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: imull %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: imull %ebp, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl %edx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %esi, %edx -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: addl %edx, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload @@ -3465,41 +3488,39 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: addl %edx, %ecx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %ecx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: adcl %edi, %ebp +; X32-NEXT: adcl %ebp, %edi ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movzbl %bl, %edi ; X32-NEXT: adcl %edi, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, %ebx ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload @@ -3516,7 +3537,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload @@ -3536,66 +3557,65 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 88(%esi), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 88(%ecx), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 92(%ecx), %ebp ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 92(%esi), %ebx +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 80(%ecx), %ebx ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl 84(%ecx), %ebp ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl (%esp), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %ebp, %ebx -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 80(%esi), %edi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 84(%esi), %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %ebx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: adcl %ecx, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: addl %esi, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload @@ -3607,130 +3627,130 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %ebx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %ebp, (%esp) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edi ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 72(%esi), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 72(%ecx), %ebp ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 76(%esi), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl 76(%ecx), %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl (%esp), %ebp # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl 64(%ebx), %ecx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl 64(%esi), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl 68(%ebx), %eax +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl 68(%esi), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebp, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl (%esp), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: adcl %ecx, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: addl %ebp, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload @@ -3738,137 +3758,139 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl %eax, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X32-NEXT: adcl %ebx, %eax -; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edi +; X32-NEXT: adcl $0, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ecx -; X32-NEXT: mull %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, %edi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb %bl -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi +; X32-NEXT: setb %cl +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebp ; X32-NEXT: movl %eax, %esi ; X32-NEXT: addl %ecx, %esi ; X32-NEXT: movzbl %bl, %eax @@ -3878,73 +3900,72 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %edi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %edi ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl +; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl %bl, %esi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload ; X32-NEXT: adcl %esi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: addl %edi, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: addl %edi, %esi ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %eax ; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -3953,148 +3974,146 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %ebx, %edx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: movl %esi, %ecx ; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %eax -; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: mull %ebp +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: setb (%esp) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ebx +; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edi ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -4107,198 +4126,199 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 96(%eax), %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 96(%eax), %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 100(%eax), %ecx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 100(%eax), %esi ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %esi, %ebp +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%esp), %ebx # 4-byte Reload +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl %esi, %edi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 104(%eax), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl 104(%eax), %ecx +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 108(%eax), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %ebp, %ecx +; X32-NEXT: movl 108(%eax), %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl (%esp), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ebx, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ebp +; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: mull %ecx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %edi, %esi -; X32-NEXT: imull %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %esi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %ebx, %ecx -; X32-NEXT: addl %edx, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: imull %eax, %ecx +; X32-NEXT: movl (%esp), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl %edx, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: imull %ebx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: imull %edi, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: imull %ebp, %esi +; X32-NEXT: imull %edi, %esi ; X32-NEXT: addl %edx, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl %ebp, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %ebx, %ebp -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: mull %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movzbl %cl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -4310,7 +4330,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: imull (%esp), %esi # 4-byte Folded Reload +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: addl %edx, %esi ; X32-NEXT: movl 124(%ebx), %eax ; X32-NEXT: imull %ecx, %eax @@ -4333,139 +4353,140 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl (%esp), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl %ebx, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ebx ; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, (%esp) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebp, %ecx ; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %edi, %ebp +; X32-NEXT: setb %cl +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %ebp -; X32-NEXT: setb %cl -; X32-NEXT: movl %edi, %eax +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, %edi ; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %ecx -; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %esi ; X32-NEXT: adcl $0, %ecx -; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: setb (%esp) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -4473,81 +4494,83 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: adcl %edi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X32-NEXT: adcl %esi, %edx -; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebx ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %edi, %esi ; X32-NEXT: imull %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl %esi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: imull %ebp, %ecx -; X32-NEXT: addl %edx, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl %edx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %edi +; X32-NEXT: movl %eax, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %ebx, %edi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %edi, %edx +; X32-NEXT: imull %ebx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: imull %esi, %edi -; X32-NEXT: addl %edx, %edi -; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: imull %edi, %esi +; X32-NEXT: addl %edx, %esi +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %ebp, %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ebp +; X32-NEXT: adcl %ecx, %ebx ; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movzbl %cl, %ecx ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -4557,7 +4580,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: addl %ecx, %edx ; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: addl %edx, %ebx @@ -4571,50 +4594,49 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: imull %ebp, %ecx ; X32-NEXT: addl %edx, %ecx -; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %ebx, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, %ebx ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl %esi, %edi -; X32-NEXT: setb %cl +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ecx, %esi +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: movl %eax, %edx +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, %ebp -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, %edx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload @@ -4623,11 +4645,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ebp ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -4637,35 +4661,33 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -4677,20 +4699,19 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl (%esp), %edx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload @@ -4698,34 +4719,35 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%esp), %edi # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, (%ecx) @@ -4771,22 +4793,22 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %eax, 80(%ecx) ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, 84(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, 88(%ecx) +; X32-NEXT: movl %ebp, 88(%ecx) ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, 92(%ecx) -; X32-NEXT: movl %ebp, 96(%ecx) -; X32-NEXT: movl %ebx, 100(%ecx) +; X32-NEXT: movl %ebx, 96(%ecx) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl %eax, 100(%ecx) ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, 104(%ecx) ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, 108(%ecx) -; X32-NEXT: movl %edi, 112(%ecx) ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, 116(%ecx) +; X32-NEXT: movl %eax, 112(%ecx) +; X32-NEXT: movl %edi, 116(%ecx) +; X32-NEXT: movl %edx, 120(%ecx) ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, 120(%ecx) -; X32-NEXT: movl %edx, 124(%ecx) +; X32-NEXT: movl %eax, 124(%ecx) ; X32-NEXT: addl $400, %esp # imm = 0x190 ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi @@ -4806,14 +4828,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq 40(%rdi), %rbx -; X64-NEXT: movq 32(%rdi), %r14 +; X64-NEXT: movq 32(%rdi), %r12 ; X64-NEXT: movq 56(%rdi), %r15 ; X64-NEXT: movq 48(%rdi), %r10 -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq (%rsi), %r11 -; X64-NEXT: movq 8(%rsi), %r8 -; X64-NEXT: movq %rsi, %r12 +; X64-NEXT: movq 8(%rsi), %r14 +; X64-NEXT: movq %rsi, %r13 ; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rdi @@ -4825,560 +4847,553 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %r9, %rcx +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %r9, %r8 ; X64-NEXT: adcq %rsi, %r10 ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %r9d ; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r10, %rsi -; X64-NEXT: adcq %r9, %r13 -; X64-NEXT: movq %r14, %rax +; X64-NEXT: adcq %r9, %rcx +; X64-NEXT: movq %r12, %rax ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rbx, %r15 ; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %r9, %r11 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: addq %r11, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r10, %r9 ; X64-NEXT: setb %r10b +; X64-NEXT: movq %rbx, %r11 ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r9, %rbx +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r9, %r15 ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r11 -; X64-NEXT: addq %rdi, %rbx -; X64-NEXT: adcq %rcx, %r11 +; X64-NEXT: adcq %rax, %rbx +; X64-NEXT: addq %rdi, %r15 +; X64-NEXT: adcq %r8, %rbx ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: adcq $0, %r13 +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq 16(%r13), %r8 +; X64-NEXT: movq %r12, %r10 ; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq 16(%r12), %r8 -; X64-NEXT: movq %r14, %r10 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rdi, %r14 +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: addq %rdi, %r12 ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq 24(%r12), %rbp +; X64-NEXT: movq 24(%r13), %rbp ; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: addq %r14, %rax -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: adcq %r9, %r12 +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: addq %r12, %rax +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: adcq %r9, %r13 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %r12, %r9 +; X64-NEXT: addq %r13, %r9 ; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: addq %rbx, %rcx -; X64-NEXT: movq %rcx, (%rsp) # 8-byte Spill -; X64-NEXT: adcq %r11, %r14 +; X64-NEXT: addq %r15, %r14 ; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rbx, %r12 +; X64-NEXT: movq %r12, (%rsp) # 8-byte Spill ; X64-NEXT: adcq $0, %r9 ; X64-NEXT: adcq $0, %rdi ; X64-NEXT: addq %rsi, %r9 -; X64-NEXT: adcq %r13, %rdi +; X64-NEXT: adcq %rcx, %rdi ; X64-NEXT: setb %r10b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: movq %r15, %rax ; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rcx, %r11 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rcx, %rbx ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r15, %rax ; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %r11, %rax -; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: movq %rax, %rbx ; X64-NEXT: adcq %rsi, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %rbp ; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movzbl %sil, %ecx -; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %r9, %r15 -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rdi, %r11 +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: adcq %rax, %rdx +; X64-NEXT: addq %r9, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %r10b, %ecx -; X64-NEXT: adcq %rcx, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rdi, %rbx +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: adcq %rax, %rcx +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq 16(%r8), %rsi -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: mulq %rbp +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: movq 16(%r14), %r11 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq 24(%r8), %r14 -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq 24(%r14), %r8 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload ; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rcx, %r11 -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r11, %rsi -; X64-NEXT: adcq %rdi, %rbx -; X64-NEXT: setb %r10b -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %r13, %r12 +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rdi, %rbx +; X64-NEXT: adcq %rsi, %r15 +; X64-NEXT: setb %sil +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rbp, %r13 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rbx, %rdi -; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: addq %r15, %rdi +; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: adcq %rax, %rcx -; X64-NEXT: movq (%r8), %r13 -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %rbp +; X64-NEXT: movq (%r14), %rbp +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq 8(%r8), %rax -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r11, %r14 -; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq %r13, %rax -; X64-NEXT: movq %r12, %r11 -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq 8(%r14), %r14 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: addq %rsi, %r12 +; X64-NEXT: adcq $0, %r15 +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: addq %r12, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rbx, %r12 +; X64-NEXT: adcq %r15, %rsi ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r8, %rbp -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r12, %rbx +; X64-NEXT: movq %r14, %r15 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %rsi, %r13 ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r11 -; X64-NEXT: addq %r9, %rbx -; X64-NEXT: adcq %rsi, %r11 +; X64-NEXT: adcq %rax, %r12 +; X64-NEXT: addq %r9, %r13 +; X64-NEXT: adcq %rbx, %r12 ; X64-NEXT: adcq $0, %rdi ; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %r13, %r10 -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rbp, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rsi, %r14 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rsi, %rbx ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: addq %r14, %rax -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: adcq %r9, %r12 -; X64-NEXT: setb %r10b ; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: adcq %r9, %rbp +; X64-NEXT: setb %r9b +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r12, %rsi -; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r9 -; X64-NEXT: addq %rbx, %r13 -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r11, %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq %rbp, %rsi +; X64-NEXT: movzbl %r9b, %eax +; X64-NEXT: adcq %rax, %r15 +; X64-NEXT: addq %r13, %r10 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r12, %rbx +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: adcq $0, %r9 +; X64-NEXT: adcq $0, %r15 ; X64-NEXT: addq %rdi, %rsi -; X64-NEXT: adcq %rcx, %r9 +; X64-NEXT: adcq %rcx, %r15 ; X64-NEXT: setb %r10b +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r8, %rdi +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rbx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx -; X64-NEXT: adcq $0, %r11 +; X64-NEXT: addq %rcx, %r12 +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, %rbp +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %r12, %r11 +; X64-NEXT: adcq %rdi, %r13 +; X64-NEXT: setb %dil ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rbx, %rcx -; X64-NEXT: adcq %r11, %r14 -; X64-NEXT: setb %r11b -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %rbx -; X64-NEXT: mulq %r15 -; X64-NEXT: addq %r14, %rax -; X64-NEXT: movzbl %r11b, %edi -; X64-NEXT: adcq %rdi, %rdx -; X64-NEXT: addq %rsi, %r12 -; X64-NEXT: adcq %r9, %rcx -; X64-NEXT: movzbl %r10b, %esi -; X64-NEXT: adcq %rsi, %rax +; X64-NEXT: movq %r8, %r9 +; X64-NEXT: mulq %r14 +; X64-NEXT: addq %r13, %rax +; X64-NEXT: movzbl %dil, %ecx +; X64-NEXT: adcq %rcx, %rdx +; X64-NEXT: addq %rsi, %rbx +; X64-NEXT: adcq %r15, %r11 +; X64-NEXT: movzbl %r10b, %ecx +; X64-NEXT: adcq %rcx, %rax ; X64-NEXT: adcq $0, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq (%rsp), %rax # 8-byte Folded Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload +; X64-NEXT: adcq (%rsp), %rdx # 8-byte Folded Reload ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq 32(%rcx), %rdi -; X64-NEXT: movq %r8, %r10 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: movq 32(%r8), %rcx +; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: movq %rbx, %r14 -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %rsi, %r11 -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq 40(%rcx), %rsi -; X64-NEXT: movq %r10, %rax +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq 40(%r8), %rsi +; X64-NEXT: movq %rbp, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r15 -; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r11, %rsi -; X64-NEXT: adcq %r9, %rbx +; X64-NEXT: adcq %rdi, %r15 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rbx, %r11 +; X64-NEXT: addq %r15, %r11 ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r9 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: movq %rdi, %r10 -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: adcq %rax, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rbx, %r13 -; X64-NEXT: adcq $0, %r14 -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: movq %r15, %rbx -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %r15, %rbp +; X64-NEXT: adcq $0, %r13 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %rbx, %rcx +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: addq %r13, %rax +; X64-NEXT: addq %rbp, %rax ; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill -; X64-NEXT: adcq %r14, %r10 -; X64-NEXT: setb %r15b -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r10, %r14 -; X64-NEXT: movzbl %r15b, %eax -; X64-NEXT: adcq %rax, %rbx -; X64-NEXT: addq %r8, %r14 -; X64-NEXT: adcq %rsi, %rbx +; X64-NEXT: adcq %r13, %r10 +; X64-NEXT: setb %bl +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %r10, %rbp +; X64-NEXT: movzbl %bl, %eax +; X64-NEXT: adcq %rax, %r15 +; X64-NEXT: addq %r12, %rbp +; X64-NEXT: adcq %rsi, %r15 ; X64-NEXT: adcq $0, %r11 -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %rcx, %r8 -; X64-NEXT: movq 48(%rcx), %rcx -; X64-NEXT: movq %rbp, %r15 -; X64-NEXT: movq %rbp, %rax +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq 48(%r8), %rcx +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r14, %r12 ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: addq %rsi, %r13 ; X64-NEXT: adcq $0, %r10 ; X64-NEXT: movq 56(%r8), %rsi -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %r13, %r12 -; X64-NEXT: adcq %r10, %r15 +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %r13, %r9 +; X64-NEXT: adcq %r10, %r14 ; X64-NEXT: setb %r8b -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r15, %r13 +; X64-NEXT: addq %r14, %r13 ; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: adcq %rax, %rsi -; X64-NEXT: addq %r14, %rbp -; X64-NEXT: movq %rbp, %r8 -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: adcq %rbx, %rdi +; X64-NEXT: addq %rbp, %rbx +; X64-NEXT: adcq %r15, %r9 ; X64-NEXT: adcq $0, %r13 ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: addq %r11, %r13 -; X64-NEXT: adcq %r9, %rsi -; X64-NEXT: setb %bpl -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: movq %r14, %rax +; X64-NEXT: adcq %rdi, %rsi +; X64-NEXT: setb %r11b +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: movq %r8, %rax ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r9, %rbx +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rdi, %r14 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r8, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rbx, %r9 -; X64-NEXT: adcq %r10, %r15 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %r14, %rbp +; X64-NEXT: adcq %r10, %r8 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r15, %rbx +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r8, %rdi ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r14 +; X64-NEXT: adcq %rax, %r15 ; X64-NEXT: addq %r13, %r12 -; X64-NEXT: adcq %rsi, %r9 -; X64-NEXT: movzbl %bpl, %eax -; X64-NEXT: adcq %rax, %rbx -; X64-NEXT: adcq $0, %r14 +; X64-NEXT: adcq %rsi, %rbp +; X64-NEXT: movzbl %r11b, %eax +; X64-NEXT: adcq %rax, %rdi +; X64-NEXT: adcq $0, %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: adcq %rax, (%rsp) # 8-byte Folded Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %r12 -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: adcq $0, %rbx -; X64-NEXT: adcq $0, %r14 +; X64-NEXT: adcq $0, %rbp +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: adcq $0, %r15 ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload ; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload ; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: movq %r10, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %rax, %rbp ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r13 +; X64-NEXT: movq %rsi, %r14 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rcx, %r10 +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rcx, %r8 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %r10, %r12 -; X64-NEXT: adcq %rsi, %r15 -; X64-NEXT: setb %r8b -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r15, %rsi -; X64-NEXT: movzbl %r8b, %eax -; X64-NEXT: adcq %rax, %r9 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r10, %r13 -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %r8, %r11 -; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: addq %r13, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r15, %r10 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r8, %rbx +; X64-NEXT: adcq %rsi, %r10 ; X64-NEXT: setb %r8b -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: movq %rbp, %r15 -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r10, %rbp -; X64-NEXT: movzbl %r8b, %eax -; X64-NEXT: adcq %rax, %r13 -; X64-NEXT: addq %rdi, %rbp -; X64-NEXT: adcq %r12, %r13 -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %r11, %r8 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r15, %r12 +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rcx, %r11 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %r10, %rsi +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: adcq %rax, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rdi, %r15 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %r8, %r14 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: mulq %rdi +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %r11, %r13 +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: addq %r15, %rax -; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r10, %r8 ; X64-NEXT: setb %r10b ; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %r8, %r12 +; X64-NEXT: movq %r12, %r11 +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %r8, %r13 ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: addq %rbp, %r11 -; X64-NEXT: adcq %r13, %r15 -; X64-NEXT: movq %r15, %rbp -; X64-NEXT: adcq $0, %r12 -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: addq %rsi, %r12 -; X64-NEXT: adcq %r9, %rdi +; X64-NEXT: adcq %rax, %r12 +; X64-NEXT: addq %rbp, %r13 +; X64-NEXT: adcq %rbx, %r12 +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, %rbx +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %r8, %r10 +; X64-NEXT: adcq $0, %rbp +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %r10, %r11 +; X64-NEXT: adcq %rbp, %r8 +; X64-NEXT: setb %r10b +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r9, %rbp +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r8, %rbx +; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: adcq %rax, %r9 +; X64-NEXT: addq %r13, %r14 +; X64-NEXT: movq %r14, %r13 +; X64-NEXT: adcq %r12, %r11 +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: addq %rsi, %rbx +; X64-NEXT: adcq %rcx, %r9 ; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %rcx, %rsi -; X64-NEXT: mulq %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r10 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %rcx, %r8 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %r8, %rax ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: adcq %rsi, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %rbp ; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movzbl %sil, %ecx -; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %r12, %r10 -; X64-NEXT: adcq %rdi, %r8 -; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; X64-NEXT: adcq %rcx, %rax ; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: adcq %rax, %rdx +; X64-NEXT: addq %rbx, %r10 +; X64-NEXT: adcq %r9, %r8 +; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; X64-NEXT: adcq %rax, %rcx ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: adcq %rbx, %r11 +; X64-NEXT: adcq %rdi, %r13 +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r15, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r14, %rbp -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; X64-NEXT: adcq %rax, %r10 ; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -5389,25 +5404,24 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq 64(%r9), %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: movq 64(%r9), %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rcx, %r15 +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdi, %r15 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %rsi, %r8 ; X64-NEXT: adcq $0, %rdi ; X64-NEXT: movq 72(%r9), %rsi -; X64-NEXT: movq %r9, %rcx -; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r9, %r13 +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r13 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %r8, %rbx @@ -5415,470 +5429,475 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: setb %r8b ; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: addq %r10, %r9 ; X64-NEXT: movzbl %r8b, %eax -; X64-NEXT: adcq %rax, %rsi +; X64-NEXT: adcq %rax, %rdi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload ; X64-NEXT: movq %r12, %rax -; X64-NEXT: movq %r15, %rdi +; X64-NEXT: movq %r15, %rcx ; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload ; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: addq %r8, %r14 ; X64-NEXT: adcq $0, %r10 ; X64-NEXT: movq %r12, %rax -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %r13, %r12 -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r13 +; X64-NEXT: movq %r12, %rcx +; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: addq %r14, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r10, %r8 ; X64-NEXT: setb %r10b ; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r15, %r13 -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %r15, %r12 +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %r8, %rbp ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r14 +; X64-NEXT: adcq %rax, %r15 ; X64-NEXT: addq %r11, %rbp -; X64-NEXT: adcq %rbx, %r14 +; X64-NEXT: adcq %rbx, %r15 ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %rcx, %rbx -; X64-NEXT: movq 80(%rcx), %r15 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq 80(%r13), %r14 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %r8, %r11 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq 88(%rbx), %rbx -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq 88(%r13), %rbx +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: addq %r11, %rax ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: adcq %r10, %r8 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: addq %r8, %r13 ; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %rbp, %rcx -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r14, %r11 +; X64-NEXT: addq %rbp, %rsi +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r15, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %r13 ; X64-NEXT: adcq $0, %r12 ; X64-NEXT: addq %r9, %r13 -; X64-NEXT: adcq %rsi, %r12 +; X64-NEXT: adcq %rdi, %r12 ; X64-NEXT: setb %bpl ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %r10 ; X64-NEXT: addq %rdi, %r10 ; X64-NEXT: adcq $0, %r8 ; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %r9, %r15 ; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: addq %r10, %rax -; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq %rax, %r10 ; X64-NEXT: adcq %r8, %rdi ; X64-NEXT: setb %r8b ; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %rcx, %r9 ; X64-NEXT: mulq %rbx ; X64-NEXT: addq %rdi, %rax -; X64-NEXT: movzbl %r8b, %ecx -; X64-NEXT: adcq %rcx, %rdx +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: adcq %rax, %rdx ; X64-NEXT: addq %r13, %rsi ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r12, %r9 -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %bpl, %ecx -; X64-NEXT: adcq %rcx, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r12, %r10 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %bpl, %eax +; X64-NEXT: adcq %rax, %rcx +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: imulq %rax, %rbx ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: mulq %r15 +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %rbx, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: imulq %rcx, %r15 -; X64-NEXT: addq %rdx, %r15 +; X64-NEXT: imulq %rcx, %r14 +; X64-NEXT: addq %rdx, %r14 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: movq %rax, %r10 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: imulq %r14, %r10 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: mulq %rsi +; X64-NEXT: imulq %rsi, %r10 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: addq %r10, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: imulq %rsi, %rbx +; X64-NEXT: imulq %r11, %rbx ; X64-NEXT: addq %rdx, %rbx ; X64-NEXT: addq %r8, %rdi -; X64-NEXT: adcq %r15, %rbx -; X64-NEXT: movq %rsi, %rax +; X64-NEXT: adcq %r14, %rbx +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r8, %r15 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %r8, %r14 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %r15, %r11 +; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r10, %r8 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r8, %r15 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %r8, %r14 ; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %rsi -; X64-NEXT: addq %rdi, %r15 +; X64-NEXT: addq %rdi, %r14 ; X64-NEXT: adcq %rbx, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq 112(%r9), %rbx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq 112(%rcx), %r10 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: imulq %rcx, %rbx -; X64-NEXT: addq %rdx, %rbx -; X64-NEXT: movq 120(%r9), %rax -; X64-NEXT: imulq %rdi, %rax -; X64-NEXT: movq %rdi, %rbp -; X64-NEXT: addq %rax, %rbx -; X64-NEXT: movq 96(%r9), %r10 -; X64-NEXT: movq 104(%r9), %rdi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: imulq %rdi, %r12 ; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r12, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: imulq %r10, %r14 -; X64-NEXT: addq %rdx, %r14 -; X64-NEXT: addq %r8, %r13 -; X64-NEXT: adcq %rbx, %r14 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: imulq %r11, %r10 +; X64-NEXT: addq %rdx, %r10 +; X64-NEXT: movq 120(%rcx), %rax +; X64-NEXT: imulq %rdi, %rax +; X64-NEXT: movq %rdi, %r12 +; X64-NEXT: addq %rax, %r10 +; X64-NEXT: movq 96(%rcx), %r13 +; X64-NEXT: movq 104(%rcx), %r8 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r15, %rbx +; X64-NEXT: imulq %r8, %rbx +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rbx, %rdx +; X64-NEXT: imulq %r13, %r9 +; X64-NEXT: addq %rdx, %r9 +; X64-NEXT: addq %rbp, %rdi +; X64-NEXT: adcq %r10, %r9 +; X64-NEXT: movq %r9, %r15 +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rbp +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %r8, %r12 +; X64-NEXT: addq %r10, %r12 ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r12, %rbx +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %r12, %r13 ; X64-NEXT: adcq %rbp, %r10 -; X64-NEXT: setb %r8b -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: setb %bl +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r11 ; X64-NEXT: addq %r10, %rax -; X64-NEXT: movzbl %r8b, %edi -; X64-NEXT: adcq %rdi, %rdx -; X64-NEXT: addq %r13, %rax -; X64-NEXT: adcq %r14, %rdx +; X64-NEXT: movzbl %bl, %r8d +; X64-NEXT: adcq %r8, %rdx +; X64-NEXT: addq %rdi, %rax +; X64-NEXT: adcq %r15, %rdx ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; X64-NEXT: adcq %r11, %rbx -; X64-NEXT: adcq %r15, %rax +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: adcq %r14, %rax ; X64-NEXT: adcq %rsi, %rdx ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload ; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; X64-NEXT: movq 80(%r13), %r8 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq 88(%r13), %r11 -; X64-NEXT: movq %r13, %r10 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r9 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq 80(%rdi), %r10 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rdi, %r14 -; X64-NEXT: adcq %rsi, %rcx -; X64-NEXT: setb %dil -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %r8, %r11 -; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %rcx, %rsi -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: adcq %rax, %r13 -; X64-NEXT: movq %r10, %rdi -; X64-NEXT: movq 64(%r10), %r10 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq 88(%rdi), %r15 +; X64-NEXT: movq %rdi, %r14 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r8, %rbx +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rcx, %r9 +; X64-NEXT: adcq $0, %r8 ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq 72(%rdi), %rax -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r9, %rdi +; X64-NEXT: adcq %r8, %rcx +; X64-NEXT: setb %r8b +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %r11, %r10 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: addq %rcx, %r12 -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq %r11, %r9 -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %r12, %rax +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: adcq %rax, %r15 +; X64-NEXT: movq 64(%r14), %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r15, %rcx -; X64-NEXT: setb %dil -; X64-NEXT: movq %r8, %r11 +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq 72(%r14), %r8 ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %r11, %r14 +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %rcx, %r9 +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rbx, %r11 +; X64-NEXT: setb %cl +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %rbx, %rbp -; X64-NEXT: adcq %r14, %r12 -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq %r10, %rdi -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: addq %r11, %rbp +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: adcq %rax, %rbx +; X64-NEXT: addq %rsi, %rbp +; X64-NEXT: adcq %rdi, %rbx +; X64-NEXT: adcq $0, %r12 +; X64-NEXT: adcq $0, %r15 +; X64-NEXT: movq %r9, %rcx +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq %r11, %rbx -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r8, %r10 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rsi, %r11 +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rcx, %r14 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: addq %r11, %rax ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: adcq %r10, %rcx -; X64-NEXT: setb %dil -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rcx, %r14 -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: adcq %rax, %r10 +; X64-NEXT: adcq %rdi, %r13 +; X64-NEXT: setb %cl +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r13, %rdi +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: adcq %rax, %rsi ; X64-NEXT: addq %rbp, %r9 ; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r12, %r11 +; X64-NEXT: adcq %rbx, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %r14 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: addq %rsi, %r14 -; X64-NEXT: adcq %r13, %r10 -; X64-NEXT: setb %dil -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rcx, %r9 +; X64-NEXT: adcq $0, %rdi ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %r9, %rax -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: adcq %rsi, %rcx -; X64-NEXT: setb %sil -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movzbl %sil, %ecx -; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %r14, %r12 -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r10, %r9 -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: adcq %rcx, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq %r12, %rdi +; X64-NEXT: adcq %r15, %rsi +; X64-NEXT: setb %cl +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r9, %rbx +; X64-NEXT: adcq $0, %r11 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r8, %r9 +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: adcq %r11, %r13 +; X64-NEXT: setb %r8b +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: addq %r13, %rax +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: adcq %rax, %rdx +; X64-NEXT: addq %rdi, %r15 +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rsi, %r10 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: adcq %rax, %r11 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq 96(%rdi), %rsi -; X64-NEXT: imulq %rsi, %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq 96(%rcx), %rsi +; X64-NEXT: imulq %rsi, %r9 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %r8, %rcx -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %r15, %rdx -; X64-NEXT: movq 104(%rdi), %r9 -; X64-NEXT: imulq %r9, %rcx -; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r9, %rdx +; X64-NEXT: movq 104(%rcx), %r9 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: imulq %r9, %rax +; X64-NEXT: addq %rdx, %rax +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: movq 112(%rcx), %rax ; X64-NEXT: movq %rcx, %r14 -; X64-NEXT: movq 112(%rdi), %rax ; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; X64-NEXT: imulq %r12, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: mulq %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: imulq %r10, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %rcx, %rdx -; X64-NEXT: movq 120(%rdi), %rdi -; X64-NEXT: imulq %r15, %rdi -; X64-NEXT: addq %rdx, %rdi -; X64-NEXT: addq %r10, %r8 -; X64-NEXT: adcq %r14, %rdi -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq 120(%r14), %r13 +; X64-NEXT: imulq %rbx, %r13 +; X64-NEXT: addq %rdx, %r13 +; X64-NEXT: addq %rdi, %r8 +; X64-NEXT: adcq %r11, %r13 +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r10, %r13 -; X64-NEXT: adcq $0, %r14 -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rdi, %rbx +; X64-NEXT: adcq $0, %r11 +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r13, %rbp -; X64-NEXT: adcq %r14, %rcx +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: addq %rbx, %r12 +; X64-NEXT: adcq %r11, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rcx, %r14 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: adcq %rax, %r10 -; X64-NEXT: addq %r8, %r14 -; X64-NEXT: adcq %rdi, %r10 +; X64-NEXT: adcq %rax, %rbx +; X64-NEXT: addq %r8, %r9 +; X64-NEXT: adcq %r13, %rbx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: imulq %r15, %rdi -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: imulq %r10, %rdi +; X64-NEXT: movq %r10, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; X64-NEXT: imulq %r12, %rsi -; X64-NEXT: addq %rdx, %rsi -; X64-NEXT: movq %rsi, %r8 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq %r11, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: imulq %r9, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: imulq %r14, %rax +; X64-NEXT: addq %rdx, %rax ; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: imulq %r8, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: imulq %r11, %rbx -; X64-NEXT: addq %rdx, %rbx -; X64-NEXT: addq %rcx, %r13 -; X64-NEXT: adcq %r8, %rbx -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: imulq %rdi, %rbp +; X64-NEXT: addq %rdx, %rbp +; X64-NEXT: addq %rcx, %r11 +; X64-NEXT: adcq %r13, %rbp +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r8, %r15 +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r8, %rsi -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rsi, %r11 -; X64-NEXT: adcq %rdi, %r8 +; X64-NEXT: addq %rcx, %rsi +; X64-NEXT: adcq $0, %r8 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %rsi, %rcx +; X64-NEXT: adcq %r8, %rdi ; X64-NEXT: setb %sil -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: addq %r8, %rax +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: addq %rdi, %rax ; X64-NEXT: movzbl %sil, %esi ; X64-NEXT: adcq %rsi, %rdx -; X64-NEXT: addq %r13, %rax +; X64-NEXT: addq %r11, %rax +; X64-NEXT: adcq %rbp, %rdx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: adcq %r12, %rcx +; X64-NEXT: adcq %r9, %rax ; X64-NEXT: adcq %rbx, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: adcq %rbp, %r11 -; X64-NEXT: adcq %r14, %rax -; X64-NEXT: adcq %r10, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload @@ -5887,10 +5906,10 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload @@ -5898,9 +5917,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload ; X64-NEXT: movq %rdi, %r9 ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload @@ -5923,9 +5942,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %r8, 64(%rsi) ; X64-NEXT: movq %r9, 72(%rsi) ; X64-NEXT: movq %r10, 80(%rsi) -; X64-NEXT: movq %rbx, 88(%rsi) -; X64-NEXT: movq %rcx, 96(%rsi) -; X64-NEXT: movq %r11, 104(%rsi) +; X64-NEXT: movq %r11, 88(%rsi) +; X64-NEXT: movq %r13, 96(%rsi) +; X64-NEXT: movq %rcx, 104(%rsi) ; X64-NEXT: movq %rax, 112(%rsi) ; X64-NEXT: movq %rdx, 120(%rsi) ; X64-NEXT: addq $240, %rsp diff --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll index 83e91063cf84f..6f6dde3aa3cf4 100644 --- a/llvm/test/CodeGen/X86/mul-i256.ll +++ b/llvm/test/CodeGen/X86/mul-i256.ll @@ -21,73 +21,73 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 { ; X32-NEXT: .cfi_offset %edi, -16 ; X32-NEXT: .cfi_offset %ebx, -12 ; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 12(%ecx), %esi -; X32-NEXT: movl 8(%ecx), %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%eax), %edi +; X32-NEXT: movl 12(%eax), %ebx +; X32-NEXT: movl 8(%eax), %ebp +; X32-NEXT: movl (%edx), %edi ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %esi, %ecx -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 4(%eax), %ebp -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl 4(%eax), %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %ebp +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ebp -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl (%edi), %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl (%esi), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl 4(%edi), %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 4(%esi), %ebp +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, %esi ; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -95,111 +95,115 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 { ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 8(%eax), %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 12(%eax), %ecx +; X32-NEXT: movl 12(%eax), %ebp +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl (%esp), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edi, %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X32-NEXT: adcl %esi, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: mull %ecx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 16(%ecx), %edi -; X32-NEXT: movl %ebx, %esi -; X32-NEXT: imull %edi, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: movl 16(%ecx), %esi +; X32-NEXT: movl (%esp), %edi # 4-byte Reload +; X32-NEXT: imull %esi, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: addl %esi, %edx +; X32-NEXT: addl %edi, %edx ; X32-NEXT: movl 20(%ecx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: imull %eax, %ebp -; X32-NEXT: addl %edx, %ebp +; X32-NEXT: movl %ebx, %edi +; X32-NEXT: imull %eax, %edi +; X32-NEXT: addl %edx, %edi ; X32-NEXT: movl 24(%ecx), %eax ; X32-NEXT: movl %eax, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: imull %ebp, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %ebx, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ebx ; X32-NEXT: addl %ecx, %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl 28(%ecx), %ecx -; X32-NEXT: imull %esi, %ecx +; X32-NEXT: imull %ebx, %ecx ; X32-NEXT: addl %edx, %ecx ; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: adcl %ebp, %ecx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %ecx ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edi +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx +; X32-NEXT: addl %edi, %ebx ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload @@ -219,80 +223,80 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 { ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 24(%edi), %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl 24(%edi), %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: addl %edx, %ecx +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl %edx, %esi +; X32-NEXT: movl %edi, %edx ; X32-NEXT: movl 28(%edi), %eax -; X32-NEXT: imull %esi, %eax -; X32-NEXT: addl %eax, %ecx -; X32-NEXT: movl 16(%edi), %ebp -; X32-NEXT: movl 20(%edi), %ebx +; X32-NEXT: imull %ecx, %eax +; X32-NEXT: addl %eax, %esi +; X32-NEXT: movl 16(%edi), %edi +; X32-NEXT: movl 20(%edx), %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %edi -; X32-NEXT: imull %ebx, %edi -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebp -; X32-NEXT: addl %edi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: imull %ebp, %esi -; X32-NEXT: addl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: imull %ebp, %ebx +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %edi +; X32-NEXT: addl %ebx, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: imull %edi, %ecx +; X32-NEXT: addl %edx, %ecx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebx, %esi -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebp, %edi +; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movzbl %cl, %ecx -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: mull %ecx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, %ebx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, (%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, 4(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, 8(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, 12(%ecx) -; X32-NEXT: movl %ebx, 16(%ecx) -; X32-NEXT: movl %esi, 20(%ecx) -; X32-NEXT: movl %eax, 24(%ecx) -; X32-NEXT: movl %edx, 28(%ecx) +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, (%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, 4(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, 8(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, 12(%esi) +; X32-NEXT: movl %ecx, 16(%esi) +; X32-NEXT: movl %edi, 20(%esi) +; X32-NEXT: movl %eax, 24(%esi) +; X32-NEXT: movl %edx, 28(%esi) ; X32-NEXT: addl $72, %esp ; X32-NEXT: .cfi_def_cfa_offset 20 ; X32-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll index c39c8337ca455..4a0f0ad94cef0 100644 --- a/llvm/test/CodeGen/X86/mul-i512.ll +++ b/llvm/test/CodeGen/X86/mul-i512.ll @@ -9,71 +9,71 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: subl $184, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: subl $180, %esp ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl 28(%edx), %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl 24(%edx), %ebp -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%eax), %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 28(%eax), %ebx +; X32-NEXT: movl 24(%eax), %ebp +; X32-NEXT: movl (%edx), %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 4(%eax), %ecx +; X32-NEXT: movl 4(%eax), %edi ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, %ebp -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: adcl %ecx, %edi ; X32-NEXT: setb %cl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ebp -; X32-NEXT: addl %esi, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 16(%ecx), %ebp -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl 20(%ecx), %ebx +; X32-NEXT: movl 16(%ecx), %ebx ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ebp -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 20(%ecx), %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebp, %edi +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -81,211 +81,214 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 8(%edi), %ebp -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 8(%eax), %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl 12(%edi), %ecx ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 12(%eax), %ecx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: adcl %ebp, %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %ebx, %esi ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %edi, %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %esi, %ebp ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %ebp, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx ; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X32-NEXT: adcl %esi, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %ebp, (%esp) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 8(%ecx), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 8(%ecx), %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl 12(%ecx), %ecx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 12(%ecx), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: mull %ebp ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl (%ecx), %esi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill -; X32-NEXT: movl 4(%ecx), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl (%esi), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %ebx, %esi +; X32-NEXT: movl 4(%esi), %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebp, %edi +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: adcl %ecx, %ebp ; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebp, %ecx ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %ebp, %edi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: movzbl %cl, %ecx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: adcl %ecx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl %ebp, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload -; X32-NEXT: adcl %edi, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload +; X32-NEXT: adcl %ebx, %eax ; X32-NEXT: adcl $0, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -295,302 +298,300 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 16(%eax), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 16(%eax), %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 20(%eax), %ecx -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %ebx ; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, %ebx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 20(%eax), %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %ebp +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 24(%eax), %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 28(%eax), %ecx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: adcl %ebx, %edi ; X32-NEXT: setb %bl -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 24(%eax), %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 28(%eax), %ebx +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %ebx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: addl %ebp, %esi ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ecx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl $0, %eax +; X32-NEXT: adcl $0, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: addl %ebx, %edi -; X32-NEXT: movl (%esp), %ebx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %edi, %edx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %ebx, %ecx -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: adcl $0, %eax -; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edi -; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %esi ; X32-NEXT: addl %edi, %esi -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, %esi +; X32-NEXT: mull %ebx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %ebx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edi, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ebp, %ebx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: setb (%esp) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebx, %ecx -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: adcl %edi, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %edi, %ecx +; X32-NEXT: addl %ebx, %ecx ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X32-NEXT: adcl %esi, %eax -; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -606,410 +607,411 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 32(%eax), %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 32(%eax), %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 36(%eax), %ecx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 36(%eax), %esi ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %esi, %ebp +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, %esi ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 40(%eax), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl 40(%eax), %ecx +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 44(%eax), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %ebp, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl 44(%edi), %edi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %esi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ebp -; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: mull %ecx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %edi, %esi -; X32-NEXT: imull %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %esi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %ebx, %ecx -; X32-NEXT: addl %edx, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: imull %eax, %ecx +; X32-NEXT: movl (%esp), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl %edx, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %esi -; X32-NEXT: movl (%esp), %edi # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: imull %edi, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %eax, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: imull %ebp, %esi +; X32-NEXT: imull %ecx, %esi ; X32-NEXT: addl %edx, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: adcl (%esp), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ecx, %ebx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %ebx, %ebp -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl (%esp), %eax # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movzbl %cl, %ecx ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl 56(%ebx), %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl 56(%edi), %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: addl %edx, %esi -; X32-NEXT: movl 60(%ebx), %eax -; X32-NEXT: imull %ecx, %eax -; X32-NEXT: addl %eax, %esi -; X32-NEXT: movl 48(%ebx), %edi -; X32-NEXT: movl 52(%ebx), %ebp +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl %edx, %ecx +; X32-NEXT: movl 60(%edi), %eax +; X32-NEXT: imull %ebp, %eax +; X32-NEXT: addl %eax, %ecx +; X32-NEXT: movl 48(%edi), %esi +; X32-NEXT: movl 52(%edi), %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: imull %ebp, %ebx -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %edi +; X32-NEXT: imull %edi, %ebx +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %esi ; X32-NEXT: addl %ebx, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %edi, %ecx -; X32-NEXT: addl %edx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: imull %esi, %ebx +; X32-NEXT: addl %edx, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ebp, %ebx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl %ebx, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: mull %edi +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movzbl %cl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 40(%edi), %ebp -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 40(%ecx), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl 44(%edi), %edi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl 44(%ecx), %ebp ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: adcl %ecx, %esi ; X32-NEXT: setb %cl -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 32(%ecx), %esi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl 32(%esi), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl 36(%ecx), %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 36(%esi), %ebp ; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: adcl %ecx, %ebx ; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %edi, %ebx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi ; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ecx -; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl %eax, %ebx +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb (%esp) # 1-byte Folded Spill +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, %esi -; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl %ecx, %ebp -; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 48(%ecx), %edi -; X32-NEXT: movl %ebx, %esi -; X32-NEXT: imull %edi, %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl 48(%esi), %edi +; X32-NEXT: imull %edi, %ecx ; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %esi, %edx -; X32-NEXT: movl 52(%ecx), %eax +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movl 52(%esi), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: imull %eax, %ebx ; X32-NEXT: addl %edx, %ebx -; X32-NEXT: movl 56(%ecx), %eax +; X32-NEXT: movl 56(%esi), %eax ; X32-NEXT: movl %eax, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: imull %ebp, %esi @@ -1025,78 +1027,77 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl %ebx, %esi ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl +; X32-NEXT: adcl %ebp, %edi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull %ecx ; X32-NEXT: addl %edi, %eax -; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: imull %eax, %ecx -; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl (%esp), %esi # 4-byte Reload +; X32-NEXT: imull %esi, %ecx +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: addl %ecx, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: imull %ebp, %edi +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: addl %edx, %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: imull %edi, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: imull %ebx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp ; X32-NEXT: addl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %ebx, %ecx +; X32-NEXT: imull %ebp, %ecx ; X32-NEXT: addl %edx, %ecx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl %edi, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ecx, %esi +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb %cl +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl %ebx, %ecx +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -1107,7 +1108,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl (%esp), %edi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload @@ -1164,7 +1165,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %edi, 52(%ecx) ; X32-NEXT: movl %eax, 56(%ecx) ; X32-NEXT: movl %edx, 60(%ecx) -; X32-NEXT: addl $184, %esp +; X32-NEXT: addl $180, %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx @@ -1181,200 +1182,200 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: pushq %rbx ; X64-NEXT: pushq %rax ; X64-NEXT: movq %rdx, (%rsp) # 8-byte Spill +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq (%rdi), %rbx -; X64-NEXT: movq 8(%rdi), %r9 -; X64-NEXT: movq 24(%rdi), %r12 -; X64-NEXT: movq 16(%rdi), %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: movq 8(%rdi), %rdi +; X64-NEXT: movq 24(%rax), %r14 +; X64-NEXT: movq 16(%rax), %rax +; X64-NEXT: movq (%rsi), %r8 ; X64-NEXT: movq 8(%rsi), %r11 -; X64-NEXT: movq %rsi, %rdi -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %rcx, %rsi -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rsi, %r13 +; X64-NEXT: movq %rax, %rsi ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r12, %rax -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r15 -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %r10 ; X64-NEXT: addq %rcx, %r10 -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %r14, %rax +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %r10, %rcx -; X64-NEXT: adcq %r8, %r14 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r10, %r15 +; X64-NEXT: adcq %r9, %rcx ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %esi -; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %r14, %r10 -; X64-NEXT: adcq %rsi, %r13 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rcx, %r9 +; X64-NEXT: adcq %rsi, %rdx +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rbx, %rsi ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r14, %r15 -; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: addq %r15, %rax +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rcx, %r14 +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %rsi, %r8 +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: addq %r14, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rbp, %rbx +; X64-NEXT: adcq %rbx, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rbx, %rbp +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rcx, %rbx ; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: adcq %rax, %r14 -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; X64-NEXT: adcq %rcx, %r14 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq %rdi, %rsi -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq 16(%rdi), %r8 -; X64-NEXT: movq %r12, %r11 +; X64-NEXT: addq %rbp, %rbx +; X64-NEXT: adcq %r15, %r14 +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: adcq $0, %r12 ; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r13, %rsi +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq 16(%r13), %r10 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %r9, %r12 -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rcx, %r15 -; X64-NEXT: adcq $0, %rbx +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %rdi, %r12 +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %rcx, %rbp +; X64-NEXT: adcq $0, %r15 ; X64-NEXT: movq 24(%rsi), %rsi -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %r15, %r11 -; X64-NEXT: adcq %rbx, %r9 -; X64-NEXT: setb %bl +; X64-NEXT: addq %rbp, %r11 +; X64-NEXT: adcq %r15, %rcx +; X64-NEXT: setb %dil ; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %r9, %rcx -; X64-NEXT: movzbl %bl, %eax +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %rcx, %rbp +; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: adcq %rax, %r15 -; X64-NEXT: addq %rbp, %rdi -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq %rbx, %r13 +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r14, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %rcx +; X64-NEXT: adcq $0, %rbp ; X64-NEXT: adcq $0, %r15 -; X64-NEXT: addq %r10, %rcx -; X64-NEXT: adcq %r13, %r15 -; X64-NEXT: setb %r12b +; X64-NEXT: addq %r9, %rbp +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; X64-NEXT: setb %dil +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r11 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rdi, %rbx +; X64-NEXT: addq %rcx, %rbx ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %rbx, %rax ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: adcq %r9, %rbp -; X64-NEXT: setb %dil -; X64-NEXT: movq %r10, %rax +; X64-NEXT: adcq %r9, %rcx +; X64-NEXT: setb %r8b +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: addq %rbp, %rax -; X64-NEXT: movzbl %dil, %edi -; X64-NEXT: adcq %rdi, %rdx -; X64-NEXT: addq %rcx, %r11 +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: adcq %rax, %rdx +; X64-NEXT: addq %rbp, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r15, %rbx ; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %r12b, %ecx -; X64-NEXT: adcq %rcx, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: adcq %rax, %rcx +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq 32(%rcx), %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq 32(%rdi), %r15 ; X64-NEXT: imulq %r15, %rsi ; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rax, %r9 +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rax, %rcx ; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: movq 40(%rcx), %rsi -; X64-NEXT: imulq %rsi, %r8 -; X64-NEXT: addq %rdx, %r8 -; X64-NEXT: movq 48(%rcx), %rax -; X64-NEXT: movq %rcx, %r11 +; X64-NEXT: movq 40(%rdi), %rsi +; X64-NEXT: imulq %rsi, %r10 +; X64-NEXT: addq %rdx, %r10 +; X64-NEXT: movq 48(%rdi), %rax +; X64-NEXT: movq %rdi, %r8 ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: imulq %r14, %rdi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: imulq %r9, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: movq 56(%r11), %r11 -; X64-NEXT: imulq %rbx, %r11 -; X64-NEXT: addq %rdx, %r11 -; X64-NEXT: addq %r9, %rcx -; X64-NEXT: adcq %r8, %r11 -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: movq %rbx, %r8 +; X64-NEXT: movq 56(%r8), %r8 +; X64-NEXT: imulq %r11, %r8 +; X64-NEXT: addq %rdx, %r8 +; X64-NEXT: addq %rcx, %rbx +; X64-NEXT: adcq %r10, %r8 +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rdi, %rbx -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %rcx, %r15 +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rbx, %r13 -; X64-NEXT: adcq %r9, %r15 +; X64-NEXT: addq %r15, %r13 +; X64-NEXT: adcq %rdi, %rcx ; X64-NEXT: setb %dil -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %r15, %r8 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rcx, %r10 ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %rcx, %r8 -; X64-NEXT: adcq %r11, %r12 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq 48(%r9), %rsi +; X64-NEXT: addq %rbx, %r10 +; X64-NEXT: adcq %r8, %r12 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: movq 48(%r8), %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %rsi @@ -1382,68 +1383,69 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; X64-NEXT: imulq %r14, %rsi ; X64-NEXT: addq %rdx, %rsi -; X64-NEXT: movq %r9, %rdx -; X64-NEXT: movq 56(%r9), %rax +; X64-NEXT: movq %r8, %rdx +; X64-NEXT: movq 56(%r8), %rax ; X64-NEXT: imulq %rdi, %rax -; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: movq %rdi, %r8 ; X64-NEXT: addq %rax, %rsi -; X64-NEXT: movq 32(%r9), %r9 -; X64-NEXT: movq 40(%rdx), %r15 +; X64-NEXT: movq 32(%rdx), %rbp +; X64-NEXT: movq 40(%rdx), %r9 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: imulq %r15, %r11 -; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %r11, %rdx -; X64-NEXT: imulq %r9, %r10 -; X64-NEXT: addq %rdx, %r10 -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: adcq %rsi, %r10 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: imulq %r9, %rdi +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rsi, %rbx -; X64-NEXT: adcq $0, %rbp +; X64-NEXT: addq %rdi, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: imulq %rbp, %r11 +; X64-NEXT: addq %rdx, %r11 +; X64-NEXT: addq %rcx, %rbx +; X64-NEXT: adcq %rsi, %r11 +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rsi ; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: adcq $0, %r15 +; X64-NEXT: movq %rbp, %rax ; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rbx, %r9 -; X64-NEXT: adcq %rbp, %rsi -; X64-NEXT: setb %bl -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rdi, %r8 +; X64-NEXT: adcq %r15, %rcx +; X64-NEXT: setb %dil +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r14 -; X64-NEXT: addq %rsi, %rax -; X64-NEXT: movzbl %bl, %esi -; X64-NEXT: adcq %rsi, %rdx -; X64-NEXT: addq %rdi, %rax -; X64-NEXT: adcq %r10, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: adcq %r13, %r9 -; X64-NEXT: adcq %r8, %rax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: movzbl %dil, %ecx +; X64-NEXT: adcq %rcx, %rdx +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: adcq %r11, %rdx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: adcq %r13, %r8 +; X64-NEXT: adcq %r10, %rax ; X64-NEXT: adcq %r12, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: movq (%rsp), %rsi # 8-byte Reload +; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, (%rsi) +; X64-NEXT: movq %rdi, (%rcx) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 8(%rsi) +; X64-NEXT: movq %rdi, 8(%rcx) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 16(%rsi) +; X64-NEXT: movq %rdi, 16(%rcx) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 24(%rsi) -; X64-NEXT: movq %rcx, 32(%rsi) -; X64-NEXT: movq %r9, 40(%rsi) -; X64-NEXT: movq %rax, 48(%rsi) -; X64-NEXT: movq %rdx, 56(%rsi) +; X64-NEXT: movq %rdi, 24(%rcx) +; X64-NEXT: movq %rsi, 32(%rcx) +; X64-NEXT: movq %r8, 40(%rcx) +; X64-NEXT: movq %rax, 48(%rcx) +; X64-NEXT: movq %rdx, 56(%rcx) ; X64-NEXT: addq $8, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll index 4e63580beb148..fc1cc1f65627a 100644 --- a/llvm/test/CodeGen/X86/mul128.ll +++ b/llvm/test/CodeGen/X86/mul128.ll @@ -24,71 +24,70 @@ define i128 @foo(i128 %t, i128 %u) { ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $12, %esp -; X86-NEXT: .cfi_def_cfa_offset 32 +; X86-NEXT: subl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 28 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: imull %ecx, %ebp +; X86-NEXT: movl %eax, %esi ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: imull %ebp, %eax +; X86-NEXT: imull %esi, %eax ; X86-NEXT: addl %eax, %ebx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %esi, %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %ebp, %edi +; X86-NEXT: imull %ecx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: imull %ebp, %esi +; X86-NEXT: addl %edx, %esi +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx ; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: setb %bl +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movzbl %bl, %edi -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 4(%esi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, (%esi) -; X86-NEXT: movl %eax, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $12, %esp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebx, 4(%ecx) +; X86-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-NEXT: movl %esi, (%ecx) +; X86-NEXT: movl %eax, 8(%ecx) +; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll index 38082c94a727f..8b75c6fb68c78 100644 --- a/llvm/test/CodeGen/X86/muloti.ll +++ b/llvm/test/CodeGen/X86/muloti.ll @@ -17,29 +17,29 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou ; CHECK-NEXT: movq %rdi, %r10 ; CHECK-NEXT: movq %rsi, %rdx ; CHECK-NEXT: sarq $63, %rdx -; CHECK-NEXT: movq %rcx, %r8 -; CHECK-NEXT: imulq %rdx, %r8 +; CHECK-NEXT: movq %rcx, %rdi +; CHECK-NEXT: imulq %rdx, %rdi ; CHECK-NEXT: movq %r11, %rax ; CHECK-NEXT: mulq %rdx -; CHECK-NEXT: movq %rdx, %rdi +; CHECK-NEXT: movq %rdx, %r9 ; CHECK-NEXT: movq %rax, %rbx -; CHECK-NEXT: addq %rax, %rdi -; CHECK-NEXT: addq %r8, %rdi +; CHECK-NEXT: addq %rax, %r9 +; CHECK-NEXT: addq %rdi, %r9 ; CHECK-NEXT: movq %rcx, %rax ; CHECK-NEXT: sarq $63, %rax ; CHECK-NEXT: movq %rax, %r14 ; CHECK-NEXT: imulq %rsi, %r14 ; CHECK-NEXT: mulq %r10 -; CHECK-NEXT: movq %rax, %r9 -; CHECK-NEXT: movq %rdx, %r8 -; CHECK-NEXT: addq %r14, %r8 -; CHECK-NEXT: addq %rax, %r8 -; CHECK-NEXT: addq %rbx, %r9 -; CHECK-NEXT: adcq %rdi, %r8 +; CHECK-NEXT: movq %rax, %r8 +; CHECK-NEXT: movq %rdx, %rdi +; CHECK-NEXT: addq %r14, %rdi +; CHECK-NEXT: addq %rax, %rdi +; CHECK-NEXT: addq %rbx, %r8 +; CHECK-NEXT: adcq %r9, %rdi ; CHECK-NEXT: movq %r10, %rax ; CHECK-NEXT: mulq %r11 ; CHECK-NEXT: movq %rdx, %rbx -; CHECK-NEXT: movq %rax, %rdi +; CHECK-NEXT: movq %rax, %r9 ; CHECK-NEXT: movq %rsi, %rax ; CHECK-NEXT: mulq %r11 ; CHECK-NEXT: movq %rdx, %r11 @@ -58,8 +58,8 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou ; CHECK-NEXT: mulq %rcx ; CHECK-NEXT: addq %rbx, %rax ; CHECK-NEXT: adcq %r11, %rdx -; CHECK-NEXT: addq %r9, %rax -; CHECK-NEXT: adcq %r8, %rdx +; CHECK-NEXT: addq %r8, %rax +; CHECK-NEXT: adcq %rdi, %rdx ; CHECK-NEXT: movq %r10, %rcx ; CHECK-NEXT: sarq $63, %rcx ; CHECK-NEXT: xorq %rcx, %rdx @@ -67,7 +67,7 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou ; CHECK-NEXT: orq %rdx, %rcx ; CHECK-NEXT: jne LBB0_1 ; CHECK-NEXT: ## %bb.2: ## %nooverflow -; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq %r9, %rax ; CHECK-NEXT: movq %r10, %rdx ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 diff --git a/llvm/test/CodeGen/X86/musttail-varargs.ll b/llvm/test/CodeGen/X86/musttail-varargs.ll index 1756154272018..ce672a70b1f91 100644 --- a/llvm/test/CodeGen/X86/musttail-varargs.ll +++ b/llvm/test/CodeGen/X86/musttail-varargs.ll @@ -45,12 +45,12 @@ define void @f_thunk(ptr %this, ...) { ; LINUX-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; LINUX-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; LINUX-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; LINUX-NEXT: movq %r9, %r14 -; LINUX-NEXT: movq %r8, %r15 -; LINUX-NEXT: movq %rcx, %r12 -; LINUX-NEXT: movq %rdx, %r13 -; LINUX-NEXT: movq %rsi, %rbp +; LINUX-NEXT: movl %eax, %ebp +; LINUX-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-NEXT: movq %r8, %r14 +; LINUX-NEXT: movq %rcx, %r15 +; LINUX-NEXT: movq %rdx, %r12 +; LINUX-NEXT: movq %rsi, %r13 ; LINUX-NEXT: movq %rdi, %rbx ; LINUX-NEXT: movq %rsi, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movq %rdx, {{[0-9]+}}(%rsp) @@ -78,12 +78,12 @@ define void @f_thunk(ptr %this, ...) { ; LINUX-NEXT: callq get_f@PLT ; LINUX-NEXT: movq %rax, %r11 ; LINUX-NEXT: movq %rbx, %rdi -; LINUX-NEXT: movq %rbp, %rsi -; LINUX-NEXT: movq %r13, %rdx -; LINUX-NEXT: movq %r12, %rcx -; LINUX-NEXT: movq %r15, %r8 -; LINUX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; LINUX-NEXT: movq %r14, %r9 +; LINUX-NEXT: movq %r13, %rsi +; LINUX-NEXT: movq %r12, %rdx +; LINUX-NEXT: movq %r15, %rcx +; LINUX-NEXT: movq %r14, %r8 +; LINUX-NEXT: movl %ebp, %eax +; LINUX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -138,12 +138,12 @@ define void @f_thunk(ptr %this, ...) { ; LINUX-X32-NEXT: movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; LINUX-X32-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; LINUX-X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; LINUX-X32-NEXT: movq %r9, %r14 -; LINUX-X32-NEXT: movq %r8, %r15 -; LINUX-X32-NEXT: movq %rcx, %r12 -; LINUX-X32-NEXT: movq %rdx, %r13 -; LINUX-X32-NEXT: movq %rsi, %rbp +; LINUX-X32-NEXT: movl %eax, %ebp +; LINUX-X32-NEXT: movq %r9, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-NEXT: movq %r8, %r14 +; LINUX-X32-NEXT: movq %rcx, %r15 +; LINUX-X32-NEXT: movq %rdx, %r12 +; LINUX-X32-NEXT: movq %rsi, %r13 ; LINUX-X32-NEXT: movq %rdi, %rbx ; LINUX-X32-NEXT: movq %rsi, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movq %rdx, {{[0-9]+}}(%esp) @@ -171,12 +171,12 @@ define void @f_thunk(ptr %this, ...) { ; LINUX-X32-NEXT: callq get_f@PLT ; LINUX-X32-NEXT: movl %eax, %r11d ; LINUX-X32-NEXT: movq %rbx, %rdi -; LINUX-X32-NEXT: movq %rbp, %rsi -; LINUX-X32-NEXT: movq %r13, %rdx -; LINUX-X32-NEXT: movq %r12, %rcx -; LINUX-X32-NEXT: movq %r15, %r8 -; LINUX-X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; LINUX-X32-NEXT: movq %r14, %r9 +; LINUX-X32-NEXT: movq %r13, %rsi +; LINUX-X32-NEXT: movq %r12, %rdx +; LINUX-X32-NEXT: movq %r15, %rcx +; LINUX-X32-NEXT: movq %r14, %r8 +; LINUX-X32-NEXT: movl %ebp, %eax +; LINUX-X32-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload ; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload ; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index 529e0ad24936a..f0fb89496aa1e 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -1396,72 +1396,72 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE2-LABEL: interleave_24i32_out: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqu 64(%rdi), %xmm1 +; SSE2-NEXT: movdqu 64(%rdi), %xmm2 ; SSE2-NEXT: movups 80(%rdi), %xmm4 ; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu 16(%rdi), %xmm2 -; SSE2-NEXT: movups 32(%rdi), %xmm5 -; SSE2-NEXT: movdqu 48(%rdi), %xmm3 -; SSE2-NEXT: movaps %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,1,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[2,0] +; SSE2-NEXT: movdqu 16(%rdi), %xmm3 +; SSE2-NEXT: movups 32(%rdi), %xmm6 +; SSE2-NEXT: movdqu 48(%rdi), %xmm1 +; SSE2-NEXT: movaps %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,1,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm3[2,0] ; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm5[2,0] -; SSE2-NEXT: movaps %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm6[2,0] +; SSE2-NEXT: movaps %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,1,1] ; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm4[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[2,0] -; SSE2-NEXT: movdqa %xmm3, %xmm10 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[2,0] +; SSE2-NEXT: movdqa %xmm1, %xmm10 ; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm4[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm1[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[2,0] ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm2[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,1],xmm3[3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[2,0] ; SSE2-NEXT: movups %xmm10, 16(%rsi) ; SSE2-NEXT: movups %xmm8, (%rsi) -; SSE2-NEXT: movups %xmm3, 16(%rdx) +; SSE2-NEXT: movups %xmm1, 16(%rdx) ; SSE2-NEXT: movups %xmm0, (%rdx) ; SSE2-NEXT: movups %xmm9, 16(%rcx) -; SSE2-NEXT: movups %xmm7, (%rcx) +; SSE2-NEXT: movups %xmm5, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: interleave_24i32_out: ; SSE42: # %bb.0: ; SSE42-NEXT: movups 80(%rdi), %xmm0 ; SSE42-NEXT: movdqu 64(%rdi), %xmm1 -; SSE42-NEXT: movdqu (%rdi), %xmm3 +; SSE42-NEXT: movdqu (%rdi), %xmm4 ; SSE42-NEXT: movdqu 16(%rdi), %xmm2 -; SSE42-NEXT: movups 32(%rdi), %xmm4 +; SSE42-NEXT: movups 32(%rdi), %xmm3 ; SSE42-NEXT: movdqu 48(%rdi), %xmm5 ; SSE42-NEXT: movdqa %xmm2, %xmm6 -; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] -; SSE42-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[2,3] -; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[1] +; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] +; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[2,3] +; SSE42-NEXT: insertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[1] ; SSE42-NEXT: movdqa %xmm1, %xmm8 ; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,3],xmm8[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3] ; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm1[2,3] ; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm0[1] -; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,2,2,2] +; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,2,2,2] ; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm10[6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,0,3,3] ; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,2,2,2] ; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,5],xmm10[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,3],xmm7[4,5,6,7] -; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,3] +; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[0,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,3],xmm9[4,5,6,7] ; SSE42-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,3] ; SSE42-NEXT: movups %xmm5, 16(%rsi) -; SSE42-NEXT: movups %xmm3, (%rsi) +; SSE42-NEXT: movups %xmm4, (%rsi) ; SSE42-NEXT: movdqu %xmm10, 16(%rdx) ; SSE42-NEXT: movdqu %xmm6, (%rdx) ; SSE42-NEXT: movups %xmm9, 16(%rcx) @@ -1635,38 +1635,38 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movups (%rsi), %xmm1 ; SSE2-NEXT: movups 16(%rsi), %xmm0 -; SSE2-NEXT: movups (%rdx), %xmm2 +; SSE2-NEXT: movups (%rdx), %xmm3 ; SSE2-NEXT: movups 16(%rdx), %xmm5 ; SSE2-NEXT: movups (%rcx), %xmm4 -; SSE2-NEXT: movups 16(%rcx), %xmm6 -; SSE2-NEXT: movaps %xmm4, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[1,3] -; SSE2-NEXT: movaps %xmm1, %xmm3 -; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm7[0,2] -; SSE2-NEXT: movaps %xmm0, %xmm7 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] -; SSE2-NEXT: movaps %xmm6, %xmm8 -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[1,3] -; SSE2-NEXT: movaps %xmm0, %xmm9 +; SSE2-NEXT: movups 16(%rcx), %xmm7 +; SSE2-NEXT: movaps %xmm4, %xmm6 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[1,3] +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[0,2] +; SSE2-NEXT: movaps %xmm0, %xmm8 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm5[1] +; SSE2-NEXT: movaps %xmm7, %xmm9 +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[1,3] +; SSE2-NEXT: movaps %xmm0, %xmm6 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm6[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm5[1,1] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm7[0,2] -; SSE2-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm7[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm5[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm8[0,2] +; SSE2-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm9[0,2] ; SSE2-NEXT: movaps %xmm1, %xmm5 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] +; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[0,2] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] ; SSE2-NEXT: movups %xmm4, 16(%rdi) -; SSE2-NEXT: movups %xmm9, 48(%rdi) -; SSE2-NEXT: movups %xmm6, 64(%rdi) -; SSE2-NEXT: movups %xmm3, (%rdi) +; SSE2-NEXT: movups %xmm6, 48(%rdi) +; SSE2-NEXT: movups %xmm7, 64(%rdi) +; SSE2-NEXT: movups %xmm2, (%rdi) ; SSE2-NEXT: movups %xmm1, 32(%rdi) ; SSE2-NEXT: movups %xmm0, 80(%rdi) ; SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll index f57defc368d5e..2f557679a1558 100644 --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -157,35 +157,35 @@ define void @PR42833() { ; SSE2: # %bb.0: ; SSE2-NEXT: movl b(%rip), %eax ; SSE2-NEXT: movdqa c+128(%rip), %xmm0 -; SSE2-NEXT: movdqa c+144(%rip), %xmm1 +; SSE2-NEXT: movdqa c+144(%rip), %xmm2 ; SSE2-NEXT: addl c+128(%rip), %eax -; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: paddd %xmm0, %xmm3 ; SSE2-NEXT: movdqa d+144(%rip), %xmm4 -; SSE2-NEXT: psubd %xmm1, %xmm4 -; SSE2-NEXT: paddd %xmm1, %xmm1 +; SSE2-NEXT: psubd %xmm2, %xmm4 +; SSE2-NEXT: paddd %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] -; SSE2-NEXT: movdqa %xmm1, c+144(%rip) +; SSE2-NEXT: movdqa %xmm2, c+144(%rip) ; SSE2-NEXT: movaps %xmm5, c+128(%rip) -; SSE2-NEXT: movdqa c+160(%rip), %xmm1 +; SSE2-NEXT: movdqa c+160(%rip), %xmm2 ; SSE2-NEXT: movdqa c+176(%rip), %xmm3 ; SSE2-NEXT: movdqa d+160(%rip), %xmm5 ; SSE2-NEXT: movdqa d+176(%rip), %xmm6 ; SSE2-NEXT: movdqa d+128(%rip), %xmm7 -; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: psubd %xmm0, %xmm7 ; SSE2-NEXT: psubd %xmm3, %xmm6 -; SSE2-NEXT: psubd %xmm1, %xmm5 +; SSE2-NEXT: psubd %xmm2, %xmm5 ; SSE2-NEXT: movdqa %xmm5, d+160(%rip) ; SSE2-NEXT: movdqa %xmm6, d+176(%rip) ; SSE2-NEXT: movdqa %xmm4, d+144(%rip) ; SSE2-NEXT: movdqa %xmm7, d+128(%rip) ; SSE2-NEXT: paddd %xmm3, %xmm3 -; SSE2-NEXT: paddd %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, c+160(%rip) +; SSE2-NEXT: paddd %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, c+160(%rip) ; SSE2-NEXT: movdqa %xmm3, c+176(%rip) ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll index b0739b6f47458..4b398095b549d 100644 --- a/llvm/test/CodeGen/X86/optimize-max-0.ll +++ b/llvm/test/CodeGen/X86/optimize-max-0.ll @@ -16,65 +16,65 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: subl $28, %esp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: imull %esi, %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: imull %ebp, %ecx ; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %eax, (%esp) ## 4-byte Spill +; CHECK-NEXT: movl %ecx, (%esp) ## 4-byte Spill ; CHECK-NEXT: je LBB0_19 ; CHECK-NEXT: ## %bb.1: ## %bb10.preheader -; CHECK-NEXT: movl %eax, %ebp -; CHECK-NEXT: sarl $31, %ebp -; CHECK-NEXT: shrl $30, %ebp -; CHECK-NEXT: addl %eax, %ebp -; CHECK-NEXT: sarl $2, %ebp -; CHECK-NEXT: testl %edx, %edx +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: sarl $31, %eax +; CHECK-NEXT: shrl $30, %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: sarl $2, %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: jle LBB0_12 ; CHECK-NEXT: ## %bb.2: ## %bb.nph9 -; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: testl %ebp, %ebp ; CHECK-NEXT: jle LBB0_12 ; CHECK-NEXT: ## %bb.3: ## %bb.nph9.split ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: incl %eax ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movl %edi, %edx -; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_4: ## %bb6 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movzbl (%eax,%edi,2), %ebx -; CHECK-NEXT: movb %bl, (%edx,%edi) -; CHECK-NEXT: incl %edi -; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: movzbl (%eax,%esi,2), %ebx +; CHECK-NEXT: movb %bl, (%edx,%esi) +; CHECK-NEXT: incl %esi +; CHECK-NEXT: cmpl %ebp, %esi ; CHECK-NEXT: jl LBB0_4 ; CHECK-NEXT: ## %bb.5: ## %bb9 ; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1 ; CHECK-NEXT: incl %ecx ; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl %esi, %edx -; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: addl %ebp, %edx +; CHECK-NEXT: cmpl %edi, %ecx ; CHECK-NEXT: je LBB0_12 ; CHECK-NEXT: ## %bb.6: ## %bb7.preheader ; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: jmp LBB0_4 ; CHECK-NEXT: LBB0_12: ## %bb18.loopexit -; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: movl (%esp), %eax ## 4-byte Reload -; CHECK-NEXT: addl %ebp, %eax +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp) +; CHECK-NEXT: cmpl $1, %edi ; CHECK-NEXT: jle LBB0_13 ; CHECK-NEXT: ## %bb.7: ## %bb.nph5 -; CHECK-NEXT: cmpl $2, %esi +; CHECK-NEXT: cmpl $2, %ebp ; CHECK-NEXT: jl LBB0_13 ; CHECK-NEXT: ## %bb.8: ## %bb.nph5.split -; CHECK-NEXT: movl %esi, %ebp -; CHECK-NEXT: shrl $31, %ebp -; CHECK-NEXT: addl %esi, %ebp -; CHECK-NEXT: sarl %ebp +; CHECK-NEXT: movl %ebp, %edx +; CHECK-NEXT: shrl $31, %edx +; CHECK-NEXT: addl %ebp, %edx +; CHECK-NEXT: sarl %edx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $31, %ecx @@ -84,12 +84,12 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: addl $2, %edx -; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl %edx, %ecx -; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: addl $2, %esi +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload +; CHECK-NEXT: addl %esi, %ecx +; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_9: ## %bb13 @@ -97,90 +97,89 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: ## Child Loop BB0_10 Depth 2 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: addl %edx, %edi +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: addl %esi, %edi ; CHECK-NEXT: imull {{[0-9]+}}(%esp), %edi ; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_10: ## %bb14 ; CHECK-NEXT: ## Parent Loop BB0_9 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 -; CHECK-NEXT: movzbl -2(%edi,%ebx,4), %edx -; CHECK-NEXT: movb %dl, (%ecx,%ebx) -; CHECK-NEXT: movzbl (%edi,%ebx,4), %edx -; CHECK-NEXT: movb %dl, (%eax,%ebx) -; CHECK-NEXT: incl %ebx -; CHECK-NEXT: cmpl %ebp, %ebx +; CHECK-NEXT: movzbl -2(%edi,%esi,4), %ebx +; CHECK-NEXT: movb %bl, (%ecx,%esi) +; CHECK-NEXT: movzbl (%edi,%esi,4), %ebx +; CHECK-NEXT: movb %bl, (%eax,%esi) +; CHECK-NEXT: incl %esi +; CHECK-NEXT: cmpl %edx, %esi ; CHECK-NEXT: jl LBB0_10 ; CHECK-NEXT: ## %bb.11: ## %bb17 ; CHECK-NEXT: ## in Loop: Header=BB0_9 Depth=1 ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload ; CHECK-NEXT: incl %edi -; CHECK-NEXT: addl %ebp, %eax -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; CHECK-NEXT: addl $2, %edx -; CHECK-NEXT: addl %ebp, %ecx +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; CHECK-NEXT: addl $2, %esi +; CHECK-NEXT: addl %edx, %ecx ; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; CHECK-NEXT: jl LBB0_9 ; CHECK-NEXT: LBB0_13: ## %bb20 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: cmpl $1, %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: cmpl $1, %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx ; CHECK-NEXT: je LBB0_19 ; CHECK-NEXT: ## %bb.14: ## %bb20 -; CHECK-NEXT: cmpl $3, %ecx +; CHECK-NEXT: cmpl $3, %eax ; CHECK-NEXT: jne LBB0_24 ; CHECK-NEXT: ## %bb.15: ## %bb22 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; CHECK-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; CHECK-NEXT: testl %edx, %edx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; CHECK-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: jle LBB0_18 ; CHECK-NEXT: ## %bb.16: ## %bb.nph -; CHECK-NEXT: leal 15(%edx), %eax +; CHECK-NEXT: leal 15(%edi), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: imull {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl %ebp, %ebp -; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-NEXT: addl %edi, %ecx -; CHECK-NEXT: addl %ecx, %ebp -; CHECK-NEXT: addl %eax, %ebx -; CHECK-NEXT: leal 15(%esi), %eax +; CHECK-NEXT: addl %ebx, %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload +; CHECK-NEXT: addl %esi, %ecx +; CHECK-NEXT: addl %ecx, %ebx +; CHECK-NEXT: addl %eax, %edx +; CHECK-NEXT: leal 15(%ebp), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_17: ## %bb23 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: pushl %esi -; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %ebp -; CHECK-NEXT: movl %ebp, %edi -; CHECK-NEXT: movl %ebx, %ebp +; CHECK-NEXT: pushl %edx +; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: movl %ebx, %esi ; CHECK-NEXT: movl %edx, %ebx ; CHECK-NEXT: calll _memcpy ; CHECK-NEXT: movl %ebx, %edx -; CHECK-NEXT: movl %ebp, %ebx -; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: addl %esi, %ebp -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; CHECK-NEXT: decl %edx +; CHECK-NEXT: addl %ebp, %ebx +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; CHECK-NEXT: decl %edi ; CHECK-NEXT: jne LBB0_17 ; CHECK-NEXT: LBB0_18: ## %bb26 -; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; CHECK-NEXT: addl %ecx, %esi -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: addl %esi, %edx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: addl %eax, %ecx ; CHECK-NEXT: jmp LBB0_23 ; CHECK-NEXT: LBB0_19: ## %bb29 -; CHECK-NEXT: testl %edx, %edx +; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: jle LBB0_22 ; CHECK-NEXT: ## %bb.20: ## %bb.nph11 -; CHECK-NEXT: leal 15(%esi), %eax +; CHECK-NEXT: movl %edi, %esi +; CHECK-NEXT: leal 15(%ebp), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -188,32 +187,30 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: LBB0_21: ## %bb30 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: pushl %esi -; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: pushl %edx ; CHECK-NEXT: pushl %edi -; CHECK-NEXT: movl %ebx, %ebp ; CHECK-NEXT: movl %edx, %ebx ; CHECK-NEXT: calll _memcpy ; CHECK-NEXT: movl %ebx, %edx -; CHECK-NEXT: movl %ebp, %ebx ; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: addl %esi, %edi -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; CHECK-NEXT: decl %edx +; CHECK-NEXT: addl %ebp, %edi +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; CHECK-NEXT: decl %esi ; CHECK-NEXT: jne LBB0_21 ; CHECK-NEXT: LBB0_22: ## %bb33 -; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: addl %ecx, %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload +; CHECK-NEXT: addl %edx, %ecx ; CHECK-NEXT: LBB0_23: ## %bb33 -; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: movl %edx, %eax ; CHECK-NEXT: shrl $31, %eax -; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: sarl %eax ; CHECK-NEXT: subl $4, %esp ; CHECK-NEXT: pushl %eax ; CHECK-NEXT: pushl $128 -; CHECK-NEXT: pushl %edx +; CHECK-NEXT: pushl %ecx ; CHECK-NEXT: calll _memset ; CHECK-NEXT: addl $44, %esp ; CHECK-NEXT: LBB0_25: ## %return diff --git a/llvm/test/CodeGen/X86/overflow.ll b/llvm/test/CodeGen/X86/overflow.ll index b98ebdeb6b890..5900e7674cd0e 100644 --- a/llvm/test/CodeGen/X86/overflow.ll +++ b/llvm/test/CodeGen/X86/overflow.ll @@ -10,23 +10,23 @@ define i128 @mulhioverflow(i64 %a, i64 %b, i64 %c) nounwind { ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ebp, %ebx +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: adcl %edi, %esi ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: mull %ecx diff --git a/llvm/test/CodeGen/X86/pr43820.ll b/llvm/test/CodeGen/X86/pr43820.ll index 7214ec75b0cf7..2fb4410567be6 100644 --- a/llvm/test/CodeGen/X86/pr43820.ll +++ b/llvm/test/CodeGen/X86/pr43820.ll @@ -20,9 +20,9 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %r12 ; CHECK-NEXT: movq %r12, %r10 ; CHECK-NEXT: shrq $4, %r10 -; CHECK-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F -; CHECK-NEXT: andq %rax, %r10 -; CHECK-NEXT: andq %rax, %r12 +; CHECK-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: andq %rsi, %r12 ; CHECK-NEXT: shlq $4, %r12 ; CHECK-NEXT: orq %r10, %r12 ; CHECK-NEXT: movabsq $3689348814741910323, %r10 # imm = 0x3333333333333333 @@ -36,14 +36,13 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: andq %r13, %rbp ; CHECK-NEXT: shrq %r12 ; CHECK-NEXT: andq %r13, %r12 -; CHECK-NEXT: leaq (%r12,%rbp,2), %rsi -; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: leaq (%r12,%rbp,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %r14 ; CHECK-NEXT: movq %r14, %r12 ; CHECK-NEXT: shrq $4, %r12 -; CHECK-NEXT: movq %rax, %rbp -; CHECK-NEXT: andq %rax, %r12 -; CHECK-NEXT: andq %rax, %r14 +; CHECK-NEXT: andq %rsi, %r12 +; CHECK-NEXT: andq %rsi, %r14 ; CHECK-NEXT: shlq $4, %r14 ; CHECK-NEXT: orq %r12, %r14 ; CHECK-NEXT: movq %r14, %r12 @@ -61,8 +60,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %r15 ; CHECK-NEXT: movq %r15, %r12 ; CHECK-NEXT: shrq $4, %r12 -; CHECK-NEXT: andq %rbp, %r12 -; CHECK-NEXT: andq %rbp, %r15 +; CHECK-NEXT: andq %rsi, %r12 +; CHECK-NEXT: andq %rsi, %r15 ; CHECK-NEXT: shlq $4, %r15 ; CHECK-NEXT: orq %r12, %r15 ; CHECK-NEXT: movq %r15, %r12 @@ -79,8 +78,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rbx ; CHECK-NEXT: movq %rbx, %r15 ; CHECK-NEXT: shrq $4, %r15 -; CHECK-NEXT: andq %rbp, %r15 -; CHECK-NEXT: andq %rbp, %rbx +; CHECK-NEXT: andq %rsi, %r15 +; CHECK-NEXT: andq %rsi, %rbx ; CHECK-NEXT: shlq $4, %rbx ; CHECK-NEXT: orq %r15, %rbx ; CHECK-NEXT: movq %rbx, %r15 @@ -97,8 +96,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rbp, %rbx -; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: andq %rsi, %rbx +; CHECK-NEXT: andq %rsi, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rbx, %rdi ; CHECK-NEXT: movq %rdi, %rbx @@ -116,8 +115,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rbp, %rbx -; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: andq %rsi, %rbx +; CHECK-NEXT: andq %rsi, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rbx, %rdi ; CHECK-NEXT: movq %rdi, %rbx @@ -135,8 +134,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rbp, %rbx -; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: andq %rsi, %rbx +; CHECK-NEXT: andq %rsi, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rbx, %rdi ; CHECK-NEXT: movq %rdi, %rbx @@ -154,8 +153,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rbp, %rbx -; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: andq %rsi, %rbx +; CHECK-NEXT: andq %rsi, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rbx, %rdi ; CHECK-NEXT: movq %rdi, %rbx @@ -173,8 +172,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rbp, %rbx -; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: andq %rsi, %rbx +; CHECK-NEXT: andq %rsi, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rbx, %rdi ; CHECK-NEXT: movq %rdi, %rbx @@ -192,8 +191,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rbp, %rbx -; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: andq %rsi, %rbx +; CHECK-NEXT: andq %rsi, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rbx, %rdi ; CHECK-NEXT: movq %rdi, %rbx @@ -211,8 +210,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rbp, %rax -; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: andq %rsi, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rax, %rdi ; CHECK-NEXT: movq %rdi, %rax @@ -224,13 +223,12 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: andq %r14, %rdi ; CHECK-NEXT: shrq %rax ; CHECK-NEXT: andq %r14, %rax -; CHECK-NEXT: leaq (%rax,%rdi,2), %rax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: leaq (%rax,%rdi,2), %rdi ; CHECK-NEXT: bswapq %r9 ; CHECK-NEXT: movq %r9, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rbp, %rax -; CHECK-NEXT: andq %rbp, %r9 +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: andq %rsi, %r9 ; CHECK-NEXT: shlq $4, %r9 ; CHECK-NEXT: orq %rax, %r9 ; CHECK-NEXT: movq %r9, %rax @@ -247,8 +245,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %r8 ; CHECK-NEXT: movq %r8, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rbp, %rax -; CHECK-NEXT: andq %rbp, %r8 +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: andq %rsi, %r8 ; CHECK-NEXT: shlq $4, %r8 ; CHECK-NEXT: orq %rax, %r8 ; CHECK-NEXT: movq %r8, %rax @@ -260,12 +258,13 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: andq %r14, %r8 ; CHECK-NEXT: shrq %rax ; CHECK-NEXT: andq %r14, %rax -; CHECK-NEXT: leaq (%rax,%r8,2), %r8 +; CHECK-NEXT: leaq (%rax,%r8,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %rcx ; CHECK-NEXT: movq %rcx, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rbp, %rax -; CHECK-NEXT: andq %rbp, %rcx +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: andq %rsi, %rcx ; CHECK-NEXT: shlq $4, %rcx ; CHECK-NEXT: orq %rax, %rcx ; CHECK-NEXT: movq %rcx, %rax @@ -277,12 +276,12 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: andq %r14, %rcx ; CHECK-NEXT: shrq %rax ; CHECK-NEXT: andq %r14, %rax -; CHECK-NEXT: leaq (%rax,%rcx,2), %r12 +; CHECK-NEXT: leaq (%rax,%rcx,2), %rbx ; CHECK-NEXT: bswapq %rdx ; CHECK-NEXT: movq %rdx, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rbp, %rax -; CHECK-NEXT: andq %rbp, %rdx +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: andq %rsi, %rdx ; CHECK-NEXT: shlq $4, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %rax @@ -294,28 +293,28 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: andq %r14, %rdx ; CHECK-NEXT: shrq %rax ; CHECK-NEXT: andq %r14, %rax -; CHECK-NEXT: leaq (%rax,%rdx,2), %rdi -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; CHECK-NEXT: bswapq %rax -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: shrq $4, %rcx -; CHECK-NEXT: andq %rbp, %rcx -; CHECK-NEXT: andq %rbp, %rax -; CHECK-NEXT: shlq $4, %rax -; CHECK-NEXT: orq %rcx, %rax -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: andq %r10, %rcx -; CHECK-NEXT: shrq $2, %rax +; CHECK-NEXT: leaq (%rax,%rdx,2), %rdx +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; CHECK-NEXT: bswapq %rcx +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: shrq $4, %rax +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: andq %rsi, %rcx +; CHECK-NEXT: shlq $4, %rcx +; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: movq %rcx, %rax ; CHECK-NEXT: andq %r10, %rax -; CHECK-NEXT: leaq (%rax,%rcx,4), %rax -; CHECK-NEXT: movq %rax, %r10 -; CHECK-NEXT: andq %r14, %r10 +; CHECK-NEXT: shrq $2, %rcx +; CHECK-NEXT: andq %r10, %rcx +; CHECK-NEXT: leaq (%rcx,%rax,4), %rax +; CHECK-NEXT: movq %rax, %rsi +; CHECK-NEXT: andq %r14, %rsi ; CHECK-NEXT: shrq %rax ; CHECK-NEXT: andq %r14, %rax -; CHECK-NEXT: leaq (%rax,%r10,2), %rdx -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; CHECK-NEXT: leaq (%rax,%rsi,2), %rsi +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; CHECK-NEXT: shrdq $24, %rax, %rsi +; CHECK-NEXT: shrdq $24, %rax, %r10 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: shrdq $24, %rcx, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -324,48 +323,46 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload ; CHECK-NEXT: shrdq $24, %r13, %rbp +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; CHECK-NEXT: shrdq $24, %r12, %r13 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r15, %r13 +; CHECK-NEXT: shrdq $24, %r15, %r12 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; CHECK-NEXT: shrdq $24, %r14, %r15 -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; CHECK-NEXT: shrdq $24, %rbx, %r14 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r11, %rbx -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r10, %r11 +; CHECK-NEXT: shrdq $24, %r11, %r14 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r9, %r10 +; CHECK-NEXT: shrdq $24, %r9, %r11 +; CHECK-NEXT: movq %rdi, %r8 +; CHECK-NEXT: shrdq $24, %rdi, %r9 +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; CHECK-NEXT: shrdq $24, %rdi, %r8 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; CHECK-NEXT: shrdq $24, %rcx, %r9 -; CHECK-NEXT: movq %r8, %rax -; CHECK-NEXT: shrdq $24, %r8, %rcx -; CHECK-NEXT: movq %rcx, %r8 -; CHECK-NEXT: shrdq $24, %r12, %rax -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: shrdq $24, %rdi, %r12 -; CHECK-NEXT: shrdq $24, %rdx, %rdi +; CHECK-NEXT: shrdq $24, %rcx, %rdi +; CHECK-NEXT: shrdq $24, %rbx, %rcx +; CHECK-NEXT: shrdq $24, %rdx, %rbx +; CHECK-NEXT: shrdq $24, %rsi, %rdx ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; CHECK-NEXT: movq %rdi, 112(%rax) -; CHECK-NEXT: movq %r12, 104(%rax) +; CHECK-NEXT: movq %rdx, 112(%rax) +; CHECK-NEXT: movq %rbx, 104(%rax) ; CHECK-NEXT: movq %rcx, 96(%rax) -; CHECK-NEXT: movq %r8, 88(%rax) -; CHECK-NEXT: movq %r9, 80(%rax) -; CHECK-NEXT: movq %r10, 72(%rax) +; CHECK-NEXT: movq %rdi, 88(%rax) +; CHECK-NEXT: movq %r8, 80(%rax) +; CHECK-NEXT: movq %r9, 72(%rax) ; CHECK-NEXT: movq %r11, 64(%rax) -; CHECK-NEXT: movq %rbx, 56(%rax) -; CHECK-NEXT: movq %r14, 48(%rax) -; CHECK-NEXT: movq %r15, 40(%rax) +; CHECK-NEXT: movq %r14, 56(%rax) +; CHECK-NEXT: movq %r15, 48(%rax) +; CHECK-NEXT: movq %r12, 40(%rax) ; CHECK-NEXT: movq %r13, 32(%rax) ; CHECK-NEXT: movq %rbp, 24(%rax) ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: movq %rcx, 16(%rax) ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: movq %rcx, 8(%rax) -; CHECK-NEXT: movq %rsi, (%rax) -; CHECK-NEXT: movq %rdx, %rcx -; CHECK-NEXT: shrq $56, %rdx -; CHECK-NEXT: movb %dl, 124(%rax) +; CHECK-NEXT: movq %r10, (%rax) +; CHECK-NEXT: movq %rsi, %rcx +; CHECK-NEXT: shrq $56, %rsi +; CHECK-NEXT: movb %sil, 124(%rax) ; CHECK-NEXT: shrq $24, %rcx ; CHECK-NEXT: movl %ecx, 120(%rax) ; CHECK-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/pr46527.ll b/llvm/test/CodeGen/X86/pr46527.ll index 5ae953ab82ab4..ab454cfe470f3 100644 --- a/llvm/test/CodeGen/X86/pr46527.ll +++ b/llvm/test/CodeGen/X86/pr46527.ll @@ -7,11 +7,11 @@ define void @f(ptr %out, <16 x i8> %in, i1 %flag) { ; CHECK-NEXT: calll .L0$pb ; CHECK-NEXT: .cfi_adjust_cfa_offset 4 ; CHECK-NEXT: .L0$pb: -; CHECK-NEXT: popl %ecx +; CHECK-NEXT: popl %eax ; CHECK-NEXT: .cfi_adjust_cfa_offset -4 ; CHECK-NEXT: .Ltmp0: -; CHECK-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %ecx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; CHECK-NEXT: notb %dl ; CHECK-NEXT: andb $1, %dl @@ -22,8 +22,8 @@ define void @f(ptr %out, <16 x i8> %in, i1 %flag) { ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; CHECK-NEXT: paddb %xmm1, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}@GOTOFF(%ecx), %xmm1 -; CHECK-NEXT: movdqa %xmm1, (%eax) +; CHECK-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}@GOTOFF(%eax), %xmm1 +; CHECK-NEXT: movdqa %xmm1, (%ecx) ; CHECK-NEXT: retl entry: %0 = select i1 %flag, i8 0, i8 2 diff --git a/llvm/test/CodeGen/X86/pr46877.ll b/llvm/test/CodeGen/X86/pr46877.ll index 798663c1d4dca..56618205ec7c1 100644 --- a/llvm/test/CodeGen/X86/pr46877.ll +++ b/llvm/test/CodeGen/X86/pr46877.ll @@ -5,200 +5,201 @@ define void @tester(float %0, float %1, float %2, float %3, float %4, float %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13, float %14, float %15, float %16, float %17, float %18, float %19, float %20, float %21, float %22, float %23, float %24, float %25, float %26, float %27, float %28, float %29, float %30, float %31, float %32, float %33, float %34, float %35, float %36, float %37, float %38, float %39, float %40, float %41, float %42, float %43, float %44, float %45, float %46, float %47, float %48, float %49, float %50, float %51, float %52, float %53, float %54, float %55, float %56, float %57, float %58, float %59, float %60, float %61, float %62, float %63, float %64, float %65, float %66, float %67, float %68, float %69, float %70, float %71, float %72, float %73, float %74, float %75, float %76, float %77, float %78, float %79, ptr %80) { ; CHECK-LABEL: tester: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovaps %xmm3, %xmm15 -; CHECK-NEXT: vmovss {{.*#+}} xmm14 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovaps %xmm3, %xmm13 +; CHECK-NEXT: vmovaps %xmm1, %xmm14 +; CHECK-NEXT: vmovss {{.*#+}} xmm11 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero ; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovss {{.*#+}} xmm13 = mem[0],zero,zero,zero ; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm12 -; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm10 -; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm10 = (xmm3 * xmm10) - xmm0 +; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm12 * xmm5) + xmm0 ; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm2 -; CHECK-NEXT: vmulss %xmm2, %xmm10, %xmm4 +; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; CHECK-NEXT: vmulss %xmm6, %xmm12, %xmm2 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm7 * xmm2) + xmm0 -; CHECK-NEXT: vmulss %xmm4, %xmm2, %xmm5 -; CHECK-NEXT: vmulss %xmm0, %xmm13, %xmm2 +; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 +; CHECK-NEXT: vmulss %xmm0, %xmm9, %xmm2 ; CHECK-NEXT: vmovss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: vmulss %xmm2, %xmm9, %xmm2 +; CHECK-NEXT: vmulss %xmm2, %xmm8, %xmm2 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm3 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * mem) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm3 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm4 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm4 = -(xmm14 * xmm4) + xmm0 -; CHECK-NEXT: vmulss %xmm5, %xmm4, %xmm4 -; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm5, %xmm2 -; CHECK-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm5 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm9 * xmm5) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm4 = -(xmm11 * xmm4) + xmm0 +; CHECK-NEXT: vmulss %xmm1, %xmm4, %xmm4 +; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm5 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm8 * xmm5) + xmm0 ; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm4 ; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss %xmm0, %xmm5, %xmm8 -; CHECK-NEXT: vmovss %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: vmovaps %xmm5, %xmm10 -; CHECK-NEXT: vmulss %xmm14, %xmm8, %xmm5 +; CHECK-NEXT: vmulss %xmm0, %xmm5, %xmm1 +; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: vmovaps %xmm5, %xmm15 +; CHECK-NEXT: vmulss %xmm1, %xmm11, %xmm5 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm12 * xmm5) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm5, %xmm2 ; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm13 * xmm5) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm9 * xmm5) + xmm0 ; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm4 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm11 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm9 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm11 * xmm3) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm9 * xmm3) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 ; CHECK-NEXT: vmulss %xmm4, %xmm2, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm15 * xmm3) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm13 * xmm3) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm4 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm8, %xmm6 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm4, %xmm6, %xmm4 -; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm4 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm4 = -(xmm1 * xmm4) + xmm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm4, %xmm6, %xmm4 -; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm1 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm14, %xmm5 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm10 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm10 = -(xmm10 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm5, %xmm10, %xmm5 +; CHECK-NEXT: vmulss %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm8, %xmm5 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm14 * xmm5) + xmm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm10 = -(xmm10 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm5, %xmm10, %xmm5 +; CHECK-NEXT: vmulss %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm15, %xmm1 ; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm4 = -(xmm1 * xmm4) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm4 -; CHECK-NEXT: vmulss %xmm0, %xmm12, %xmm5 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm10 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm10 = -(xmm1 * xmm10) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm10, %xmm3 +; CHECK-NEXT: vmulss %xmm0, %xmm12, %xmm15 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm15, %xmm10 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm10 = -(xmm10 * mem) + xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm13, %xmm12 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm12 = -(xmm7 * xmm12) + xmm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm12 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm12 = -(xmm6 * xmm12) + xmm0 ; CHECK-NEXT: vmulss %xmm10, %xmm12, %xmm10 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vmulss %xmm4, %xmm10, %xmm12 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vmulss %xmm3, %xmm10, %xmm12 ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm7 * xmm2) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm6 * xmm2) + xmm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss %xmm5, %xmm3, %xmm6 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm9 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm1 +; CHECK-NEXT: vmulss %xmm3, %xmm15, %xmm1 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm6, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm6 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm8 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm8, %xmm5 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vmulss %xmm1, %xmm6, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm2 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm13 * xmm2) + xmm0 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm1, %xmm5, %xmm1 +; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm2 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm4 * xmm2) + xmm0 ; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm12, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm4 ; CHECK-NEXT: vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm3 +; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm3 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm12 * xmm3) + xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm2 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 -; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm15, %xmm1 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0 +; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; CHECK-NEXT: # xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vmulss %xmm3, %xmm1, %xmm1 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm3 * xmm2) + xmm0 +; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Reload ; CHECK-NEXT: # xmm10 = mem[0],zero,zero,zero ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm2 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm7 = -(xmm7 * mem) + xmm0 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm12 * xmm2) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm7, %xmm2 +; CHECK-NEXT: vmulss %xmm2, %xmm6, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm8 = -(xmm8 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm8, %xmm2 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm7 = -(xmm7 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm2, %xmm7, %xmm2 ; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm15 * xmm2) + xmm0 -; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; CHECK-NEXT: vmulss %xmm0, %xmm6, %xmm2 -; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm2 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm13 * xmm2) + xmm0 +; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm2 +; CHECK-NEXT: vmulss %xmm0, %xmm5, %xmm1 +; CHECK-NEXT: vmulss %xmm3, %xmm1, %xmm1 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm6 * xmm3) + xmm0 -; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm3 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm4 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm5 * xmm3) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm1, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm8, %xmm1 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm8, %xmm4 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm11 = -(xmm6 * xmm11) + xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm2 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm15 * xmm2) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm9 = -(xmm5 * xmm9) + xmm0 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm15, %xmm3 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm13 * xmm3) + xmm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm11, %xmm2 -; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm2 -; CHECK-NEXT: vfnmadd132ss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 4-byte Folded Reload -; CHECK-NEXT: # xmm14 = -(xmm14 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm14, %xmm4 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss %xmm6, %xmm13, %xmm7 -; CHECK-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 4-byte Folded Reload -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm8 -; CHECK-NEXT: vmulss %xmm5, %xmm15, %xmm5 +; CHECK-NEXT: vmulss %xmm3, %xmm9, %xmm3 +; CHECK-NEXT: vmulss %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vfnmadd132ss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 4-byte Folded Reload +; CHECK-NEXT: # xmm11 = -(xmm11 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm11, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm4 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm6 +; CHECK-NEXT: vmulss %xmm5, %xmm14, %xmm5 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm15, %xmm7 +; CHECK-NEXT: vmulss %xmm15, %xmm13, %xmm8 ; CHECK-NEXT: vmovss {{.*#+}} xmm11 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm11 * xmm5) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm8 = -(xmm11 * xmm8) + xmm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm9 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm9 = -(xmm11 * xmm9) + xmm0 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm7 = -(xmm11 * xmm7) + xmm0 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm6 = -(xmm11 * xmm6) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm11 * xmm5) + xmm0 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm10 ; CHECK-NEXT: vmulss %xmm0, %xmm12, %xmm11 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm11, %xmm11 ; CHECK-NEXT: vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm11 = -(xmm12 * xmm11) + xmm0 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm10 = -(xmm12 * xmm10) + xmm0 -; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm8 = (xmm15 * xmm8) - xmm0 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm4, %xmm3, %xmm0 -; CHECK-NEXT: vmulss %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm7 = (xmm13 * xmm7) - xmm0 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm1, %xmm0 +; CHECK-NEXT: vmulss %xmm0, %xmm8, %xmm0 ; CHECK-NEXT: vmulss %xmm0, %xmm9, %xmm0 -; CHECK-NEXT: vmulss %xmm0, %xmm7, %xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vmulss %xmm0, %xmm6, %xmm0 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm1 ; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm6, %xmm1 +; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm1 ; CHECK-NEXT: vmulss %xmm1, %xmm11, %xmm1 ; CHECK-NEXT: vmulss %xmm1, %xmm10, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; CHECK-NEXT: vmulss %xmm1, %xmm8, %xmm1 +; CHECK-NEXT: vmulss %xmm1, %xmm7, %xmm1 ; CHECK-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vmovss %xmm0, (%rdi) ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr57340.ll b/llvm/test/CodeGen/X86/pr57340.ll index 6ae04d5ca2fdb..a6ae7ce5ccd15 100644 --- a/llvm/test/CodeGen/X86/pr57340.ll +++ b/llvm/test/CodeGen/X86/pr57340.ll @@ -13,8 +13,8 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm0 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-NEXT: vmovdqu (%rax), %xmm6 -; CHECK-NEXT: vpextrw $0, %xmm6, %eax +; CHECK-NEXT: vmovdqu (%rax), %xmm5 +; CHECK-NEXT: vpextrw $0, %xmm5, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm2 ; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2 @@ -26,16 +26,16 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: vpextrw $0, %xmm3, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm3 -; CHECK-NEXT: vpsrld $16, %xmm6, %xmm4 +; CHECK-NEXT: vpsrld $16, %xmm5, %xmm4 ; CHECK-NEXT: vpextrw $0, %xmm4, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm4 ; CHECK-NEXT: setne %al ; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: vcvtph2ps %xmm3, %xmm5 +; CHECK-NEXT: vcvtph2ps %xmm3, %xmm6 ; CHECK-NEXT: vcvtph2ps %xmm4, %xmm3 ; CHECK-NEXT: kmovw %eax, %k0 -; CHECK-NEXT: vucomiss %xmm5, %xmm3 +; CHECK-NEXT: vucomiss %xmm6, %xmm3 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -68,13 +68,13 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: kandw %k1, %k0, %k0 ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm4 -; CHECK-NEXT: vcvtph2ps %xmm4, %xmm5 -; CHECK-NEXT: vpsrlq $48, %xmm6, %xmm4 +; CHECK-NEXT: vcvtph2ps %xmm4, %xmm6 +; CHECK-NEXT: vpsrlq $48, %xmm5, %xmm4 ; CHECK-NEXT: vpextrw $0, %xmm4, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm4 ; CHECK-NEXT: vcvtph2ps %xmm4, %xmm4 -; CHECK-NEXT: vucomiss %xmm5, %xmm4 +; CHECK-NEXT: vucomiss %xmm6, %xmm4 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -85,13 +85,13 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: movw $-17, %ax ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] -; CHECK-NEXT: vpextrw $0, %xmm5, %eax +; CHECK-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,0,1] +; CHECK-NEXT: vpextrw $0, %xmm6, %eax ; CHECK-NEXT: kandw %k1, %k0, %k0 ; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm5 -; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5 -; CHECK-NEXT: vucomiss %xmm5, %xmm0 +; CHECK-NEXT: vmovd %eax, %xmm6 +; CHECK-NEXT: vcvtph2ps %xmm6, %xmm6 +; CHECK-NEXT: vucomiss %xmm6, %xmm0 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -102,18 +102,18 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: movw $-33, %ax ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpextrw $0, %xmm5, %eax +; CHECK-NEXT: vpsrldq {{.*#+}} xmm6 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpextrw $0, %xmm6, %eax ; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm5 -; CHECK-NEXT: vcvtph2ps %xmm5, %xmm7 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm5 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpextrw $0, %xmm5, %eax +; CHECK-NEXT: vmovd %eax, %xmm6 +; CHECK-NEXT: vcvtph2ps %xmm6, %xmm7 +; CHECK-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpextrw $0, %xmm6, %eax ; CHECK-NEXT: kandw %k1, %k0, %k0 ; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm5 -; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5 -; CHECK-NEXT: vucomiss %xmm7, %xmm5 +; CHECK-NEXT: vmovd %eax, %xmm6 +; CHECK-NEXT: vcvtph2ps %xmm6, %xmm6 +; CHECK-NEXT: vucomiss %xmm7, %xmm6 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -147,12 +147,12 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm7 ; CHECK-NEXT: vcvtph2ps %xmm7, %xmm7 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpextrw $0, %xmm6, %eax +; CHECK-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpextrw $0, %xmm5, %eax ; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm6 -; CHECK-NEXT: vcvtph2ps %xmm6, %xmm6 -; CHECK-NEXT: vucomiss %xmm7, %xmm6 +; CHECK-NEXT: vmovd %eax, %xmm5 +; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5 +; CHECK-NEXT: vucomiss %xmm7, %xmm5 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -254,7 +254,7 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm2 ; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2 -; CHECK-NEXT: vucomiss %xmm2, %xmm5 +; CHECK-NEXT: vucomiss %xmm2, %xmm6 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -286,7 +286,7 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: vmovd %eax, %xmm0 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-NEXT: kshiftrw $1, %k0, %k0 -; CHECK-NEXT: vucomiss %xmm0, %xmm6 +; CHECK-NEXT: vucomiss %xmm0, %xmm5 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl diff --git a/llvm/test/CodeGen/X86/pr59258.ll b/llvm/test/CodeGen/X86/pr59258.ll index fb2d219556632..61ddb24eaaf87 100644 --- a/llvm/test/CodeGen/X86/pr59258.ll +++ b/llvm/test/CodeGen/X86/pr59258.ll @@ -4,7 +4,7 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind { ; CHECK-LABEL: cvt_and_clamp2: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $120, %rsp +; CHECK-NEXT: subq $104, %rsp ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps %xmm1, %xmm0 @@ -21,7 +21,7 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind { ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __truncsfhf2@PLT @@ -62,13 +62,13 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind { ; CHECK-NEXT: callq fmaxf@PLT ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: movss (%rsp), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: callq fmaxf@PLT ; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT @@ -110,7 +110,7 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind { ; CHECK-NEXT: callq fminf@PLT ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; CHECK-NEXT: movd (%rsp), %xmm0 # 4-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero @@ -157,7 +157,7 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind { ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] ; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: addq $120, %rsp +; CHECK-NEXT: addq $104, %rsp ; CHECK-NEXT: retq %2 = fptrunc <8 x float> %0 to <8 x half> %3 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> zeroinitializer, <8 x half> %2) diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index 7858d125b9da4..d2da4c0f3e86a 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -840,8 +840,8 @@ vector.ph: define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; SSE2OR3-LABEL: test14: ; SSE2OR3: # %bb.0: # %vector.ph -; SSE2OR3-NEXT: pxor %xmm5, %xmm5 -; SSE2OR3-NEXT: movdqa %xmm0, %xmm6 +; SSE2OR3-NEXT: pxor %xmm6, %xmm6 +; SSE2OR3-NEXT: movdqa %xmm0, %xmm5 ; SSE2OR3-NEXT: movdqa %xmm4, %xmm7 ; SSE2OR3-NEXT: movdqa %xmm3, %xmm8 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm9 @@ -856,27 +856,27 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; SSE2OR3-NEXT: packuswb %xmm3, %xmm1 ; SSE2OR3-NEXT: psubb %xmm0, %xmm1 ; SSE2OR3-NEXT: movdqa %xmm0, %xmm2 -; SSE2OR3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE2OR3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] ; SSE2OR3-NEXT: movdqa %xmm2, %xmm0 -; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE2OR3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE2OR3-NEXT: movdqa %xmm6, %xmm3 -; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2OR3-NEXT: pxor %xmm5, %xmm7 -; SSE2OR3-NEXT: por %xmm5, %xmm6 -; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2OR3-NEXT: pxor %xmm5, %xmm8 -; SSE2OR3-NEXT: por %xmm5, %xmm3 +; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE2OR3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; SSE2OR3-NEXT: movdqa %xmm5, %xmm3 +; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSE2OR3-NEXT: pxor %xmm6, %xmm7 +; SSE2OR3-NEXT: por %xmm6, %xmm5 +; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm5 +; SSE2OR3-NEXT: pxor %xmm6, %xmm8 +; SSE2OR3-NEXT: por %xmm6, %xmm3 ; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm3 -; SSE2OR3-NEXT: packssdw %xmm6, %xmm3 -; SSE2OR3-NEXT: pxor %xmm5, %xmm9 -; SSE2OR3-NEXT: por %xmm5, %xmm2 +; SSE2OR3-NEXT: packssdw %xmm5, %xmm3 +; SSE2OR3-NEXT: pxor %xmm6, %xmm9 +; SSE2OR3-NEXT: por %xmm6, %xmm2 ; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm2 -; SSE2OR3-NEXT: pxor %xmm5, %xmm4 -; SSE2OR3-NEXT: por %xmm5, %xmm0 +; SSE2OR3-NEXT: pxor %xmm6, %xmm4 +; SSE2OR3-NEXT: por %xmm6, %xmm0 ; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm0 ; SSE2OR3-NEXT: packssdw %xmm2, %xmm0 ; SSE2OR3-NEXT: packsswb %xmm3, %xmm0 @@ -1669,7 +1669,7 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; ; SSE41-LABEL: psubus_8i64_max: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 @@ -1677,8 +1677,8 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: movdqa %xmm7, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: movapd {{.*#+}} xmm8 = [65535,65535] @@ -1689,7 +1689,7 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: movdqa %xmm7, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: movapd %xmm8, %xmm4 @@ -1700,7 +1700,7 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: movdqa %xmm7, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm3, %xmm0 ; SSE41-NEXT: movapd %xmm8, %xmm3 @@ -1708,14 +1708,14 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: pxor %xmm1, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pand %xmm7, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pand %xmm7, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 ; SSE41-NEXT: packusdw %xmm3, %xmm8 ; SSE41-NEXT: packusdw %xmm4, %xmm8 -; SSE41-NEXT: psubusw %xmm8, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: psubusw %xmm8, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: psubus_8i64_max: diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll index e747692412034..336aa216d19b1 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix.ll @@ -452,80 +452,80 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: cqto ; X64-NEXT: idivq %rcx -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; X64-NEXT: movq %xmm3, %rdi +; X64-NEXT: movq %xmm3, %rcx ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X64-NEXT: movq %xmm3, %rax ; X64-NEXT: cqto -; X64-NEXT: idivq %rdi +; X64-NEXT: idivq %rcx ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: pxor %xmm3, %xmm3 ; X64-NEXT: pcmpgtd %xmm4, %xmm3 ; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; X64-NEXT: movq %xmm4, %r9 +; X64-NEXT: movq %xmm4, %rcx ; X64-NEXT: pxor %xmm5, %xmm5 ; X64-NEXT: pcmpgtd %xmm1, %xmm5 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; X64-NEXT: psllq $31, %xmm1 ; X64-NEXT: movq %xmm1, %rax ; X64-NEXT: cqto -; X64-NEXT: idivq %r9 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: idivq %rcx +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; X64-NEXT: movq %xmm4, %r11 ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; X64-NEXT: movq %xmm4, %rax ; X64-NEXT: cqto ; X64-NEXT: idivq %r11 -; X64-NEXT: movq %rsi, %xmm4 ; X64-NEXT: movq %r8, %xmm5 -; X64-NEXT: pxor %xmm6, %xmm6 -; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; X64-NEXT: pcmpeqd %xmm6, %xmm4 -; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2] -; X64-NEXT: pand %xmm4, %xmm5 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; X64-NEXT: movq %r10, %xmm6 ; X64-NEXT: pxor %xmm4, %xmm4 -; X64-NEXT: pcmpgtd %xmm2, %xmm4 +; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; X64-NEXT: pcmpeqd %xmm4, %xmm5 +; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2] +; X64-NEXT: pand %xmm5, %xmm6 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; X64-NEXT: pxor %xmm5, %xmm5 +; X64-NEXT: pcmpgtd %xmm2, %xmm5 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-NEXT: pxor %xmm2, %xmm2 ; X64-NEXT: pcmpgtd %xmm0, %xmm2 -; X64-NEXT: movq %rcx, %xmm0 -; X64-NEXT: pxor %xmm4, %xmm2 -; X64-NEXT: movq %rdi, %xmm4 -; X64-NEXT: pandn %xmm2, %xmm5 -; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; X64-NEXT: movdqa %xmm5, %xmm2 -; X64-NEXT: pandn %xmm0, %xmm2 -; X64-NEXT: pcmpeqd %xmm4, %xmm4 -; X64-NEXT: paddq %xmm4, %xmm0 -; X64-NEXT: pand %xmm5, %xmm0 -; X64-NEXT: por %xmm2, %xmm0 -; X64-NEXT: movq %r10, %xmm2 -; X64-NEXT: movq %rdx, %xmm5 -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; X64-NEXT: pcmpeqd %xmm6, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,0,3,2] -; X64-NEXT: pand %xmm2, %xmm5 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] -; X64-NEXT: pxor %xmm3, %xmm3 -; X64-NEXT: pcmpgtd %xmm2, %xmm3 +; X64-NEXT: movq %rsi, %xmm0 +; X64-NEXT: pxor %xmm5, %xmm2 +; X64-NEXT: movq %rdi, %xmm5 +; X64-NEXT: pandn %xmm2, %xmm6 +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; X64-NEXT: movdqa %xmm6, %xmm5 +; X64-NEXT: pandn %xmm0, %xmm5 +; X64-NEXT: pcmpeqd %xmm2, %xmm2 +; X64-NEXT: paddq %xmm2, %xmm0 +; X64-NEXT: pand %xmm6, %xmm0 +; X64-NEXT: por %xmm5, %xmm0 +; X64-NEXT: movq %r9, %xmm5 +; X64-NEXT: movq %rdx, %xmm6 +; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; X64-NEXT: pcmpeqd %xmm4, %xmm5 +; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2] +; X64-NEXT: pand %xmm5, %xmm6 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; X64-NEXT: pxor %xmm5, %xmm5 +; X64-NEXT: pcmpgtd %xmm3, %xmm5 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-NEXT: pcmpgtd %xmm1, %xmm6 -; X64-NEXT: pxor %xmm3, %xmm6 -; X64-NEXT: pandn %xmm6, %xmm5 -; X64-NEXT: movq %r9, %xmm1 -; X64-NEXT: movq %rax, %xmm2 -; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X64-NEXT: movdqa %xmm5, %xmm2 -; X64-NEXT: pandn %xmm1, %xmm2 -; X64-NEXT: paddq %xmm4, %xmm1 -; X64-NEXT: pand %xmm5, %xmm1 -; X64-NEXT: por %xmm2, %xmm1 +; X64-NEXT: pcmpgtd %xmm1, %xmm4 +; X64-NEXT: pxor %xmm5, %xmm4 +; X64-NEXT: pandn %xmm4, %xmm6 +; X64-NEXT: movq %rcx, %xmm1 +; X64-NEXT: movq %rax, %xmm3 +; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; X64-NEXT: movdqa %xmm6, %xmm3 +; X64-NEXT: pandn %xmm1, %xmm3 +; X64-NEXT: paddq %xmm2, %xmm1 +; X64-NEXT: pand %xmm6, %xmm1 +; X64-NEXT: por %xmm3, %xmm1 ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll index a308f85573c7f..371484e01556c 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -373,33 +373,32 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: subl $88, %esp ; X86-NEXT: movl 8(%ebp), %ecx ; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movl 20(%ebp), %edi -; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl 20(%ebp), %esi +; X86-NEXT: movl %esi, %ebx ; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $31, %eax, %edx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $31, %eax, %edi ; X86-NEXT: shldl $31, %ecx, %eax -; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shll $31, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl 20(%ebp) -; X86-NEXT: pushl 16(%ebp) ; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edx +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi +; X86-NEXT: pushl 16(%ebp) +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %eax ; X86-NEXT: pushl %ecx +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl %eax ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -407,12 +406,13 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: subl $1, %esi ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %ebx +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testl %edi, %edi +; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: sets %al ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: testl %ecx, %ecx @@ -420,12 +420,12 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: xorb %al, %dl ; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl 20(%ebp) ; X86-NEXT: pushl 16(%ebp) ; X86-NEXT: pushl %ecx -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl %edi ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl %eax @@ -440,14 +440,16 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: cmpl $-1, %esi ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: sbbl $2147483647, %ecx # imm = 0x7FFFFFFF -; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl %edi, %ecx ; X86-NEXT: sbbl $0, %ecx @@ -900,28 +902,27 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: movl 40(%ebp), %edx -; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl 40(%ebp), %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 24(%ebp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: leal (%ecx,%ecx), %esi +; X86-NEXT: leal (%ecx,%ecx), %edx ; X86-NEXT: shrl $31, %ecx -; X86-NEXT: shldl $31, %esi, %ecx -; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shldl $31, %edx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edx +; X86-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %esi ; X86-NEXT: pushl 40(%ebp) ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %eax ; X86-NEXT: pushl %eax -; X86-NEXT: pushl %esi -; X86-NEXT: pushl $0 ; X86-NEXT: pushl %ecx +; X86-NEXT: pushl $0 +; X86-NEXT: pushl %edx ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -929,7 +930,6 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl 28(%ebp) -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %edi ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload @@ -943,16 +943,16 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: sets %bl -; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: testl %edi, %edi ; X86-NEXT: sets %bh ; X86-NEXT: xorb %bl, %bh ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -962,37 +962,37 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: orl %edi, %esi ; X86-NEXT: setne %bl ; X86-NEXT: testb %bh, %bl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: xorl %edi, %edi +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: movl %edi, %esi +; X86-NEXT: sbbl $0, %esi ; X86-NEXT: movl %ecx, %esi ; X86-NEXT: sbbl $0, %esi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: sbbl $0, %esi -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: cmovgel %edi, %ebx -; X86-NEXT: cmovgel %edi, %edx -; X86-NEXT: cmovgel %edi, %ecx +; X86-NEXT: cmovgel %ebx, %edx +; X86-NEXT: cmovgel %ebx, %ecx +; X86-NEXT: cmovgel %ebx, %edi ; X86-NEXT: movl $-1, %esi ; X86-NEXT: cmovgel %esi, %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: negl %esi ; X86-NEXT: movl $-1, %esi -; X86-NEXT: sbbl %ecx, %esi +; X86-NEXT: sbbl %edi, %esi ; X86-NEXT: movl $-1, %esi -; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: movl $-1, %edx -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: cmovgel %edi, %eax +; X86-NEXT: sbbl %ecx, %esi +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: cmovgel %ebx, %eax ; X86-NEXT: movl $-1, %edx -; X86-NEXT: cmovgel %edx, %ecx -; X86-NEXT: shldl $31, %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmovgel %edx, %edi +; X86-NEXT: shldl $31, %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: subl $1, %eax @@ -1000,12 +1000,12 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %esi ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: sets %bl ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload @@ -1013,42 +1013,42 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: xorb %bl, %bh ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: orl {{[0-9]+}}(%esp), %edi -; X86-NEXT: orl %ecx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: orl {{[0-9]+}}(%esp), %esi +; X86-NEXT: orl %ecx, %esi ; X86-NEXT: setne %cl ; X86-NEXT: testb %bh, %cl -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl %edi, %ecx ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl $0, %ecx -; X86-NEXT: cmovgel %ecx, %esi ; X86-NEXT: cmovgel %ecx, %edx ; X86-NEXT: cmovgel %ecx, %edi +; X86-NEXT: cmovgel %ecx, %esi ; X86-NEXT: movl $-1, %ebx ; X86-NEXT: cmovgel %ebx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: negl %ecx ; X86-NEXT: movl $-1, %ecx +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: movl $-1, %ecx ; X86-NEXT: sbbl %edi, %ecx ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl $-1, %ecx -; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: cmovgel %ecx, %eax -; X86-NEXT: cmovgel %ebx, %edi -; X86-NEXT: shldl $31, %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmovgel %ebx, %esi +; X86-NEXT: shldl $31, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: subl $1, %eax @@ -1056,12 +1056,12 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %esi ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: sets %bl ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload @@ -1069,40 +1069,40 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: xorb %bl, %bh ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: orl {{[0-9]+}}(%esp), %edi -; X86-NEXT: orl %ecx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: orl {{[0-9]+}}(%esp), %esi +; X86-NEXT: orl %ecx, %esi ; X86-NEXT: setne %cl ; X86-NEXT: testb %bh, %cl -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: cmpl $-1, %eax ; X86-NEXT: movl %ebx, %ecx ; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edi, %ecx ; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl $0, %ecx -; X86-NEXT: cmovgel %ecx, %esi ; X86-NEXT: cmovgel %ecx, %edx +; X86-NEXT: cmovgel %ecx, %edi ; X86-NEXT: cmovgel %ecx, %ebx -; X86-NEXT: movl $-1, %edi -; X86-NEXT: cmovgel %edi, %eax +; X86-NEXT: movl $-1, %esi +; X86-NEXT: cmovgel %esi, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: negl %ecx ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: movl $-1, %ecx -; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: sbbl %edi, %ecx ; X86-NEXT: movl $-1, %ecx -; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: cmovgel %ecx, %eax -; X86-NEXT: cmovgel %edi, %ebx +; X86-NEXT: cmovgel %esi, %ebx ; X86-NEXT: shldl $31, %eax, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll index fbdec48c978f8..213b2b018d0ad 100644 --- a/llvm/test/CodeGen/X86/select.ll +++ b/llvm/test/CodeGen/X86/select.ll @@ -508,43 +508,43 @@ define void @test8(i1 %c, ptr %dst.addr, <6 x i32> %src1,<6 x i32> %src2) nounwi ; ATHLON-NEXT: pushl %edi ; ATHLON-NEXT: pushl %esi ; ATHLON-NEXT: testb $1, {{[0-9]+}}(%esp) -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ecx ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax -; ATHLON-NEXT: cmovnel %ecx, %eax -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edx -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ecx -; ATHLON-NEXT: cmovnel %edx, %ecx -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edx -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %esi -; ATHLON-NEXT: cmovnel %edx, %esi -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edx -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edi -; ATHLON-NEXT: cmovnel %edx, %edi -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edx ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ebx -; ATHLON-NEXT: cmovnel %edx, %ebx +; ATHLON-NEXT: cmovnel %eax, %ebx +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edi +; ATHLON-NEXT: cmovnel %eax, %edi +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %esi +; ATHLON-NEXT: cmovnel %eax, %esi +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edx +; ATHLON-NEXT: cmovnel %eax, %edx +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ecx +; ATHLON-NEXT: cmovnel %eax, %ecx ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ebp -; ATHLON-NEXT: cmovnel %edx, %ebp -; ATHLON-NEXT: movl (%eax), %eax -; ATHLON-NEXT: movl (%ecx), %ecx -; ATHLON-NEXT: movl (%esi), %edx -; ATHLON-NEXT: movl (%edi), %esi -; ATHLON-NEXT: movl (%ebx), %ebx -; ATHLON-NEXT: movl (%ebp), %edi -; ATHLON-NEXT: decl %eax -; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %ebp -; ATHLON-NEXT: movl %eax, 20(%ebp) -; ATHLON-NEXT: decl %ecx -; ATHLON-NEXT: movl %ecx, 16(%ebp) -; ATHLON-NEXT: decl %edx -; ATHLON-NEXT: movl %edx, 12(%ebp) -; ATHLON-NEXT: decl %esi -; ATHLON-NEXT: movl %esi, 8(%ebp) +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax +; ATHLON-NEXT: cmovnel %ebp, %eax +; ATHLON-NEXT: movl (%ebx), %ebp +; ATHLON-NEXT: movl (%edi), %ebx +; ATHLON-NEXT: movl (%esi), %edi +; ATHLON-NEXT: movl (%edx), %esi +; ATHLON-NEXT: movl (%ecx), %edx +; ATHLON-NEXT: movl (%eax), %ecx +; ATHLON-NEXT: decl %ebp +; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %eax +; ATHLON-NEXT: movl %ebp, 20(%eax) ; ATHLON-NEXT: decl %ebx -; ATHLON-NEXT: movl %ebx, 4(%ebp) +; ATHLON-NEXT: movl %ebx, 16(%eax) ; ATHLON-NEXT: decl %edi -; ATHLON-NEXT: movl %edi, (%ebp) +; ATHLON-NEXT: movl %edi, 12(%eax) +; ATHLON-NEXT: decl %esi +; ATHLON-NEXT: movl %esi, 8(%eax) +; ATHLON-NEXT: decl %edx +; ATHLON-NEXT: movl %edx, 4(%eax) +; ATHLON-NEXT: decl %ecx +; ATHLON-NEXT: movl %ecx, (%eax) ; ATHLON-NEXT: popl %esi ; ATHLON-NEXT: popl %edi ; ATHLON-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/sext-vsetcc.ll b/llvm/test/CodeGen/X86/sext-vsetcc.ll index ae0b010a1e594..65e3c1f0633d7 100644 --- a/llvm/test/CodeGen/X86/sext-vsetcc.ll +++ b/llvm/test/CodeGen/X86/sext-vsetcc.ll @@ -571,50 +571,50 @@ define <8 x i32> @PR63946(<8 x i32> %a0, <8 x i32> %b0) nounwind { ; SSE-LABEL: PR63946: ; SSE: # %bb.0: # %entry ; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,2,3,0] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,2,3,0] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,2,3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,2,3,0] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,0,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,0,1,2] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[3,0,1,2] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,0,1,2] +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm11 ; SSE-NEXT: pcmpeqd %xmm4, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: pcmpeqd %xmm4, %xmm14 -; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm5, %xmm15 ; SSE-NEXT: pcmpeqd %xmm4, %xmm15 +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: pcmpeqd %xmm4, %xmm14 ; SSE-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE-NEXT: pcmpeqd %xmm4, %xmm7 +; SSE-NEXT: pcmpeqd %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pcmpeqd %xmm4, %xmm6 -; SSE-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pcmpeqd %xmm4, %xmm10 ; SSE-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: pcmpeqd %xmm0, %xmm12 +; SSE-NEXT: pcmpeqd %xmm13, %xmm12 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: pcmpeqd %xmm0, %xmm13 -; SSE-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE-NEXT: por %xmm14, %xmm2 +; SSE-NEXT: pcmpeqd %xmm13, %xmm10 +; SSE-NEXT: pcmpeqd %xmm13, %xmm0 +; SSE-NEXT: por %xmm15, %xmm2 ; SSE-NEXT: por %xmm11, %xmm2 -; SSE-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: por %xmm12, %xmm13 -; SSE-NEXT: por %xmm15, %xmm6 -; SSE-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE-NEXT: pcmpeqd %xmm13, %xmm3 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: por %xmm12, %xmm10 +; SSE-NEXT: por %xmm14, %xmm7 +; SSE-NEXT: pcmpeqd %xmm13, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: pcmpeqd %xmm13, %xmm5 +; SSE-NEXT: por %xmm9, %xmm5 +; SSE-NEXT: por %xmm10, %xmm5 ; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE-NEXT: por %xmm8, %xmm5 -; SSE-NEXT: por %xmm13, %xmm5 -; SSE-NEXT: por %xmm6, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: packssdw %xmm9, %xmm5 -; SSE-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE-NEXT: packssdw %xmm10, %xmm1 +; SSE-NEXT: por %xmm2, %xmm8 +; SSE-NEXT: packssdw %xmm8, %xmm5 +; SSE-NEXT: pcmpeqd %xmm13, %xmm1 +; SSE-NEXT: packssdw %xmm6, %xmm1 ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] diff --git a/llvm/test/CodeGen/X86/shift-and.ll b/llvm/test/CodeGen/X86/shift-and.ll index 3cb680396b6ba..f6d73b1fbc6e7 100644 --- a/llvm/test/CodeGen/X86/shift-and.ll +++ b/llvm/test/CodeGen/X86/shift-and.ll @@ -168,22 +168,20 @@ define void @t5ptr(i64 %t, ptr %ptr) nounwind { define i64 @t6(i64 %key, ptr nocapture %val) nounwind { ; X32-LABEL: t6: ; X32: # %bb.0: -; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shrdl $3, %eax, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: shrl $3, %edi -; X32-NEXT: movl (%ecx), %eax -; X32-NEXT: movl 4(%ecx), %edx +; X32-NEXT: shrdl $3, %eax, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $3, %esi +; X32-NEXT: movl (%edx), %eax +; X32-NEXT: movl 4(%edx), %edx ; X32-NEXT: addl $-1, %eax ; X32-NEXT: adcl $-1, %edx -; X32-NEXT: andl %esi, %eax -; X32-NEXT: andl %edi, %edx +; X32-NEXT: andl %ecx, %eax +; X32-NEXT: andl %esi, %edx ; X32-NEXT: popl %esi -; X32-NEXT: popl %edi ; X32-NEXT: retl ; ; X64-LABEL: t6: diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll index aefc4df882c7d..1fe8d834dbcdd 100644 --- a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -160,12 +160,12 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind { ; i686-NEXT: subl $32, %esp ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl {{[0-9]+}}(%esp), %eax +; i686-NEXT: movl {{[0-9]+}}(%esp), %edx ; i686-NEXT: movl {{[0-9]+}}(%esp), %esi ; i686-NEXT: movl {{[0-9]+}}(%esp), %edi -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -177,29 +177,29 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind { ; i686-NEXT: andb $15, %cl ; i686-NEXT: negb %cl ; i686-NEXT: movsbl %cl, %ebp -; i686-NEXT: movl 24(%esp,%ebp), %edx -; i686-NEXT: movl %edx, %ebx +; i686-NEXT: movl 24(%esp,%ebp), %ebx +; i686-NEXT: movl %ebx, %edx ; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shll %cl, %ebx +; i686-NEXT: shll %cl, %edx ; i686-NEXT: notb %cl ; i686-NEXT: movl 20(%esp,%ebp), %edi ; i686-NEXT: movl %edi, %esi ; i686-NEXT: shrl %esi ; i686-NEXT: shrl %cl, %esi -; i686-NEXT: orl %ebx, %esi -; i686-NEXT: movl 16(%esp,%ebp), %ebx +; i686-NEXT: orl %edx, %esi +; i686-NEXT: movl 16(%esp,%ebp), %edx ; i686-NEXT: movl 28(%esp,%ebp), %ebp ; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shldl %cl, %edx, %ebp +; i686-NEXT: shldl %cl, %ebx, %ebp ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl %ebp, 12(%ecx) -; i686-NEXT: movl %ebx, %edx +; i686-NEXT: movl %edx, %ebx ; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shll %cl, %edx -; i686-NEXT: shldl %cl, %ebx, %edi +; i686-NEXT: shll %cl, %ebx +; i686-NEXT: shldl %cl, %edx, %edi ; i686-NEXT: movl {{[0-9]+}}(%esp), %eax ; i686-NEXT: movl %edi, 4(%eax) -; i686-NEXT: movl %edx, (%eax) +; i686-NEXT: movl %ebx, (%eax) ; i686-NEXT: movl %esi, 8(%eax) ; i686-NEXT: addl $32, %esp ; i686-NEXT: popl %esi @@ -407,8 +407,8 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi ; i686-NEXT: subl $92, %esp -; i686-NEXT: movl {{[0-9]+}}(%esp), %edi ; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp +; i686-NEXT: movl {{[0-9]+}}(%esp), %edi ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl {{[0-9]+}}(%esp), %edx ; i686-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -435,76 +435,75 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ebp, %ebx +; i686-NEXT: movl %edi, %ebx ; i686-NEXT: andl $7, %ebx -; i686-NEXT: shrl $3, %ebp -; i686-NEXT: andl $15, %ebp -; i686-NEXT: movl 32(%esp,%ebp), %eax +; i686-NEXT: shrl $3, %edi +; i686-NEXT: andl $15, %edi +; i686-NEXT: movl 32(%esp,%edi), %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl %ebx, %ecx ; i686-NEXT: shrl %cl, %eax ; i686-NEXT: movl %ebx, %ecx ; i686-NEXT: notl %ecx -; i686-NEXT: movl 36(%esp,%ebp), %edx +; i686-NEXT: movl 36(%esp,%edi), %edx ; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: addl %edx, %edx ; i686-NEXT: # kill: def $cl killed $cl killed $ecx ; i686-NEXT: shll %cl, %edx ; i686-NEXT: orl %eax, %edx ; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %edi, %ecx -; i686-NEXT: movl %edi, %edx +; i686-NEXT: movl %ebp, %eax +; i686-NEXT: movl %ebp, %edx ; i686-NEXT: andl $7, %edx -; i686-NEXT: shrl $3, %ecx -; i686-NEXT: andl $15, %ecx -; i686-NEXT: movl 64(%esp,%ecx), %esi -; i686-NEXT: movl %ecx, %edi -; i686-NEXT: movl %ecx, (%esp) # 4-byte Spill -; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: shrl $3, %eax +; i686-NEXT: andl $15, %eax +; i686-NEXT: movl 64(%esp,%eax), %ebp +; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl %eax, (%esp) # 4-byte Spill ; i686-NEXT: movl %edx, %ecx -; i686-NEXT: shrl %cl, %esi +; i686-NEXT: shrl %cl, %ebp ; i686-NEXT: movl %edx, %ecx ; i686-NEXT: notl %ecx -; i686-NEXT: movl 68(%esp,%edi), %eax -; i686-NEXT: leal (%eax,%eax), %edi +; i686-NEXT: movl 68(%esp,%eax), %esi +; i686-NEXT: leal (%esi,%esi), %eax ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shll %cl, %edi -; i686-NEXT: orl %esi, %edi -; i686-NEXT: movl 28(%esp,%ebp), %ecx +; i686-NEXT: shll %cl, %eax +; i686-NEXT: orl %ebp, %eax +; i686-NEXT: movl 28(%esp,%edi), %ecx ; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 40(%esp,%ebp), %esi +; i686-NEXT: movl 40(%esp,%edi), %edi ; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; i686-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; i686-NEXT: movl (%esp), %ecx # 4-byte Reload ; i686-NEXT: movl 60(%esp,%ecx), %ebp ; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl 72(%esp,%ecx), %ebp ; i686-NEXT: movl %edx, %ecx -; i686-NEXT: shrdl %cl, %ebp, %eax -; i686-NEXT: movl %eax, (%esp) # 4-byte Spill +; i686-NEXT: shrdl %cl, %ebp, %esi +; i686-NEXT: movl %esi, (%esp) # 4-byte Spill ; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; i686-NEXT: shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; i686-NEXT: sarl %cl, %esi +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; i686-NEXT: sarl %cl, %edi ; i686-NEXT: movl %edx, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; i686-NEXT: shrdl %cl, %eax, %ebx +; i686-NEXT: shrdl %cl, %esi, %ebx ; i686-NEXT: movl %edx, %ecx ; i686-NEXT: sarl %cl, %ebp -; i686-NEXT: movl {{[0-9]+}}(%esp), %eax -; i686-NEXT: movl %ebp, 28(%eax) -; i686-NEXT: movl (%esp), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, 24(%eax) -; i686-NEXT: movl %ebx, 16(%eax) -; i686-NEXT: movl %esi, 12(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, 8(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, (%eax) -; i686-NEXT: movl %edi, 20(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, 4(%eax) +; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx +; i686-NEXT: movl %ebp, 28(%ecx) +; i686-NEXT: movl (%esp), %edx # 4-byte Reload +; i686-NEXT: movl %edx, 24(%ecx) +; i686-NEXT: movl %ebx, 16(%ecx) +; i686-NEXT: movl %edi, 12(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; i686-NEXT: movl %edx, 8(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; i686-NEXT: movl %edx, (%ecx) +; i686-NEXT: movl %eax, 20(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: movl %eax, 4(%ecx) ; i686-NEXT: addl $92, %esp ; i686-NEXT: popl %esi ; i686-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll index 2f4071530382b..0e4e706669300 100644 --- a/llvm/test/CodeGen/X86/shift-i256.ll +++ b/llvm/test/CodeGen/X86/shift-i256.ll @@ -5,6 +5,191 @@ ; CHECK-LABEL: shift1 define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone { +; CHECK-LABEL: shift1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: subl $92, %esp +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-NEXT: sarl $31, %esi +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: andb $7, %al +; CHECK-NEXT: shrb $3, %cl +; CHECK-NEXT: movzbl %cl, %ebp +; CHECK-NEXT: movl 32(%esp,%ebp), %esi +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shrl %cl, %esi +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: notb %dl +; CHECK-NEXT: movl 36(%esp,%ebp), %ecx +; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: leal (%ecx,%ecx), %edi +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: shll %cl, %edi +; CHECK-NEXT: orl %esi, %edi +; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 40(%esp,%ebp), %esi +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shrl %cl, %esi +; CHECK-NEXT: movl 44(%esp,%ebp), %ecx +; CHECK-NEXT: movl %ecx, (%esp) # 4-byte Spill +; CHECK-NEXT: leal (%ecx,%ecx), %edi +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: shll %cl, %edi +; CHECK-NEXT: orl %esi, %edi +; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 48(%esp,%ebp), %ebx +; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shrl %cl, %ebx +; CHECK-NEXT: movl 52(%esp,%ebp), %edi +; CHECK-NEXT: leal (%edi,%edi), %esi +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: shll %cl, %esi +; CHECK-NEXT: orl %ebx, %esi +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; CHECK-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; CHECK-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill +; CHECK-NEXT: movl 28(%esp,%ebp), %edx +; CHECK-NEXT: movl 56(%esp,%ebp), %ebx +; CHECK-NEXT: shrdl %cl, %ebx, %edi +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; CHECK-NEXT: shrdl %cl, %ebp, %edx +; CHECK-NEXT: sarl %cl, %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %ebx, 28(%eax) +; CHECK-NEXT: movl %edi, 24(%eax) +; CHECK-NEXT: movl (%esp), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 16(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 8(%eax) +; CHECK-NEXT: movl %edx, (%eax) +; CHECK-NEXT: movl %esi, 20(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 12(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 4(%eax) +; CHECK-NEXT: addl $92, %esp +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi +; CHECK-NEXT: popl %ebx +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: retl +; +; CHECK-X64-O0-LABEL: shift1: +; CHECK-X64-O0: # %bb.0: # %entry +; CHECK-X64-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-X64-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-X64-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-X64-O0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: sarq $63, %rcx +; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movb %r8b, %dl +; CHECK-X64-O0-NEXT: movb %dl, %cl +; CHECK-X64-O0-NEXT: andb $7, %cl +; CHECK-X64-O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-X64-O0-NEXT: shrb $3, %dl +; CHECK-X64-O0-NEXT: movzbl %dl, %edx +; CHECK-X64-O0-NEXT: movl %edx, %edi +; CHECK-X64-O0-NEXT: movq -64(%rsp,%rdi), %rdx +; CHECK-X64-O0-NEXT: movq -56(%rsp,%rdi), %r8 +; CHECK-X64-O0-NEXT: movq %r8, %r9 +; CHECK-X64-O0-NEXT: shrq %cl, %r9 +; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload +; CHECK-X64-O0-NEXT: notb %cl +; CHECK-X64-O0-NEXT: movq -48(%rsp,%rdi), %rsi +; CHECK-X64-O0-NEXT: movq %rsi, %r10 +; CHECK-X64-O0-NEXT: addq %r10, %r10 +; CHECK-X64-O0-NEXT: shlq %cl, %r10 +; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload +; CHECK-X64-O0-NEXT: orq %r10, %r9 +; CHECK-X64-O0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-X64-O0-NEXT: movq -40(%rsp,%rdi), %rdi +; CHECK-X64-O0-NEXT: shrdq %cl, %rdi, %rsi +; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload +; CHECK-X64-O0-NEXT: shrdq %cl, %r8, %rdx +; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload +; CHECK-X64-O0-NEXT: sarq %cl, %rdi +; CHECK-X64-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; CHECK-X64-O0-NEXT: movq %rdi, 24(%rax) +; CHECK-X64-O0-NEXT: movq %rsi, 16(%rax) +; CHECK-X64-O0-NEXT: movq %rdx, (%rax) +; CHECK-X64-O0-NEXT: movq %rcx, 8(%rax) +; CHECK-X64-O0-NEXT: retq +; +; CHECK-X64-O2-LABEL: shift1: +; CHECK-X64-O2: # %bb.0: # %entry +; CHECK-X64-O2-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: sarq $63, %rcx +; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movl %r8d, %eax +; CHECK-X64-O2-NEXT: andb $7, %al +; CHECK-X64-O2-NEXT: shrb $3, %r8b +; CHECK-X64-O2-NEXT: movzbl %r8b, %edx +; CHECK-X64-O2-NEXT: movq -64(%rsp,%rdx), %rsi +; CHECK-X64-O2-NEXT: movq -56(%rsp,%rdx), %rdi +; CHECK-X64-O2-NEXT: movq %rdi, %r8 +; CHECK-X64-O2-NEXT: movl %eax, %ecx +; CHECK-X64-O2-NEXT: shrq %cl, %r8 +; CHECK-X64-O2-NEXT: notb %cl +; CHECK-X64-O2-NEXT: movq -48(%rsp,%rdx), %r10 +; CHECK-X64-O2-NEXT: leaq (%r10,%r10), %r11 +; CHECK-X64-O2-NEXT: shlq %cl, %r11 +; CHECK-X64-O2-NEXT: orq %r8, %r11 +; CHECK-X64-O2-NEXT: movq -40(%rsp,%rdx), %rdx +; CHECK-X64-O2-NEXT: movl %eax, %ecx +; CHECK-X64-O2-NEXT: shrdq %cl, %rdx, %r10 +; CHECK-X64-O2-NEXT: shrdq %cl, %rdi, %rsi +; CHECK-X64-O2-NEXT: sarq %cl, %rdx +; CHECK-X64-O2-NEXT: movq %rdx, 24(%r9) +; CHECK-X64-O2-NEXT: movq %r10, 16(%r9) +; CHECK-X64-O2-NEXT: movq %rsi, (%r9) +; CHECK-X64-O2-NEXT: movq %r11, 8(%r9) +; CHECK-X64-O2-NEXT: retq entry: %0 = ashr i256 %x, %a store i256 %0, ptr %r @@ -47,12 +232,12 @@ define i256 @shift2(i256 %c) nounwind ; CHECK-NEXT: shll %cl, %edx ; CHECK-NEXT: notb %cl ; CHECK-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; CHECK-NEXT: movl 64(%esp,%eax), %edi -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: shrl %edi -; CHECK-NEXT: shrl %cl, %edi -; CHECK-NEXT: orl %edx, %edi -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 64(%esp,%eax), %ebp +; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: shrl %ebp +; CHECK-NEXT: shrl %cl, %ebp +; CHECK-NEXT: orl %edx, %ebp +; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movl 76(%esp,%eax), %edx ; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movb %ch, %cl @@ -67,8 +252,8 @@ define i256 @shift2(i256 %c) nounwind ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movb %ch, %cl ; CHECK-NEXT: shll %cl, %esi -; CHECK-NEXT: movl 80(%esp,%eax), %ebp -; CHECK-NEXT: movl %ebp, %edx +; CHECK-NEXT: movl 80(%esp,%eax), %edi +; CHECK-NEXT: movl %edi, %edx ; CHECK-NEXT: shrl %edx ; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload ; CHECK-NEXT: shrl %cl, %edx @@ -77,21 +262,21 @@ define i256 @shift2(i256 %c) nounwind ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; CHECK-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; CHECK-NEXT: shldl %cl, %esi, %ebp -; CHECK-NEXT: movl 60(%esp,%eax), %edi +; CHECK-NEXT: shldl %cl, %esi, %edi +; CHECK-NEXT: movl 60(%esp,%eax), %ebp ; CHECK-NEXT: movl 88(%esp,%eax), %esi ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; CHECK-NEXT: shldl %cl, %eax, %esi ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %esi, 28(%eax) -; CHECK-NEXT: movl %ebp, 20(%eax) +; CHECK-NEXT: movl %edi, 20(%eax) ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; CHECK-NEXT: movl %esi, 12(%eax) -; CHECK-NEXT: movl %edi, %esi +; CHECK-NEXT: movl %ebp, %esi ; CHECK-NEXT: shll %cl, %esi -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; CHECK-NEXT: shldl %cl, %edi, %ebp -; CHECK-NEXT: movl %ebp, 4(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: shldl %cl, %ebp, %edi +; CHECK-NEXT: movl %edi, 4(%eax) ; CHECK-NEXT: movl %esi, (%eax) ; CHECK-NEXT: movl %edx, 24(%eax) ; CHECK-NEXT: movl %ebx, 16(%eax) diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll index cf41a91737d88..524ecf2aece7e 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -1989,10 +1989,10 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X86-SSE-NEXT: pxor %xmm4, %xmm4 ; X86-SSE-NEXT: movdqa %xmm3, %xmm2 ; X86-SSE-NEXT: pextrw $7, %xmm3, %eax -; X86-SSE-NEXT: pextrw $4, %xmm3, %esi -; X86-SSE-NEXT: pextrw $0, %xmm3, %edi -; X86-SSE-NEXT: pextrw $1, %xmm3, %ebx -; X86-SSE-NEXT: pextrw $3, %xmm3, %ebp +; X86-SSE-NEXT: pextrw $4, %xmm3, %edi +; X86-SSE-NEXT: pextrw $0, %xmm3, %ebp +; X86-SSE-NEXT: pextrw $1, %xmm3, %esi +; X86-SSE-NEXT: pextrw $3, %xmm3, %ebx ; X86-SSE-NEXT: movdqa %xmm3, %xmm5 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] @@ -2009,10 +2009,10 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X86-SSE-NEXT: divl %ecx ; X86-SSE-NEXT: movd %edx, %xmm4 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; X86-SSE-NEXT: movl %esi, %eax +; X86-SSE-NEXT: movl %edi, %eax ; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: divl 16(%esi) +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SSE-NEXT: divl 16(%edi) ; X86-SSE-NEXT: movd %edx, %xmm3 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; X86-SSE-NEXT: movd %xmm2, %eax @@ -2023,20 +2023,20 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X86-SSE-NEXT: movd %edx, %xmm1 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; X86-SSE-NEXT: movl %edi, %eax +; X86-SSE-NEXT: movl %ebp, %eax ; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl (%esi) +; X86-SSE-NEXT: divl (%edi) ; X86-SSE-NEXT: movd %edx, %xmm1 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; X86-SSE-NEXT: movd %xmm2, %ecx -; X86-SSE-NEXT: movl %ebx, %eax +; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx ; X86-SSE-NEXT: movd %edx, %xmm2 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X86-SSE-NEXT: movd %xmm2, %ecx -; X86-SSE-NEXT: movl %ebp, %eax +; X86-SSE-NEXT: movl %ebx, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx ; X86-SSE-NEXT: movd %edx, %xmm2 @@ -2051,24 +2051,24 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; X86-SSE-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl 32(%esi) -; X86-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm0, %xmm3 +; X86-SSE-NEXT: divl 32(%edi) +; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm4 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm3 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; X86-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 ; X86-SSE-NEXT: movl %eax, (%eax) ; X86-SSE-NEXT: movdqa %xmm3, (%eax) -; X86-SSE-NEXT: movdqa %xmm1, (%eax) +; X86-SSE-NEXT: movdqa %xmm0, (%eax) ; X86-SSE-NEXT: addl $4, %esp ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: popl %edi @@ -2210,10 +2210,10 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X64-SSE-NEXT: pxor %xmm4, %xmm4 ; X64-SSE-NEXT: movdqa %xmm3, %xmm2 ; X64-SSE-NEXT: pextrw $7, %xmm3, %eax -; X64-SSE-NEXT: pextrw $4, %xmm3, %edi -; X64-SSE-NEXT: pextrw $0, %xmm3, %r8d -; X64-SSE-NEXT: pextrw $1, %xmm3, %r9d -; X64-SSE-NEXT: pextrw $3, %xmm3, %r10d +; X64-SSE-NEXT: pextrw $4, %xmm3, %r8d +; X64-SSE-NEXT: pextrw $0, %xmm3, %r10d +; X64-SSE-NEXT: pextrw $1, %xmm3, %edi +; X64-SSE-NEXT: pextrw $3, %xmm3, %r9d ; X64-SSE-NEXT: movdqa %xmm3, %xmm5 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] @@ -2230,33 +2230,33 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X64-SSE-NEXT: divl %r11d ; X64-SSE-NEXT: movd %edx, %xmm4 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; X64-SSE-NEXT: movl %edi, %eax +; X64-SSE-NEXT: movl %r8d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 16(%rsi) ; X64-SSE-NEXT: movd %edx, %xmm3 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; X64-SSE-NEXT: movd %xmm2, %eax ; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X64-SSE-NEXT: movd %xmm1, %edi +; X64-SSE-NEXT: movd %xmm1, %r8d ; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %edi +; X64-SSE-NEXT: divl %r8d ; X64-SSE-NEXT: movd %edx, %xmm1 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; X64-SSE-NEXT: movl %r8d, %eax +; X64-SSE-NEXT: movl %r10d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl (%rsi) ; X64-SSE-NEXT: movd %edx, %xmm1 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X64-SSE-NEXT: movd %xmm2, %edi -; X64-SSE-NEXT: movl %r9d, %eax +; X64-SSE-NEXT: movd %xmm2, %r8d +; X64-SSE-NEXT: movl %edi, %eax ; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %edi +; X64-SSE-NEXT: divl %r8d ; X64-SSE-NEXT: movd %edx, %xmm2 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X64-SSE-NEXT: movd %xmm2, %edi -; X64-SSE-NEXT: movl %r10d, %eax +; X64-SSE-NEXT: movl %r9d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %edi ; X64-SSE-NEXT: movd %edx, %xmm2 diff --git a/llvm/test/CodeGen/X86/smax.ll b/llvm/test/CodeGen/X86/smax.ll index d6906b573981a..2d59422953eb3 100644 --- a/llvm/test/CodeGen/X86/smax.ll +++ b/llvm/test/CodeGen/X86/smax.ll @@ -385,26 +385,26 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-NEXT: cmpl %ecx, %eax ; X86-NEXT: cmovgl %eax, %ecx ; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovgl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl %eax, %edx -; X86-NEXT: cmovgl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl %ecx, 24(%edx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, 28(%ecx) +; X86-NEXT: movl %edx, 24(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl %eax, 20(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl %esi, 12(%edx) -; X86-NEXT: movl %edi, 8(%edx) -; X86-NEXT: movl %ebx, 4(%edx) -; X86-NEXT: movl %ebp, (%edx) -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl %eax, 16(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl %ebx, 4(%ecx) +; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -721,17 +721,17 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: shrdl $28, %edi, %ecx ; X86-NEXT: sarl $28, %edi -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: cmovll %edx, %ecx -; X86-NEXT: cmovll %esi, %edi +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: cmovll %esi, %ecx +; X86-NEXT: cmovll %edx, %edi ; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: sarl $31, %edi ; X86-NEXT: movl %ecx, (%eax) diff --git a/llvm/test/CodeGen/X86/smin.ll b/llvm/test/CodeGen/X86/smin.ll index 2b059557cdfb5..bde61d5738ed5 100644 --- a/llvm/test/CodeGen/X86/smin.ll +++ b/llvm/test/CodeGen/X86/smin.ll @@ -386,26 +386,26 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-NEXT: cmpl %ecx, %eax ; X86-NEXT: cmovll %eax, %ecx ; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovll %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl %eax, %edx -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl %ecx, 24(%edx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, 28(%ecx) +; X86-NEXT: movl %edx, 24(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl %eax, 20(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl %esi, 12(%edx) -; X86-NEXT: movl %edi, 8(%edx) -; X86-NEXT: movl %ebx, 4(%edx) -; X86-NEXT: movl %ebp, (%edx) -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl %eax, 16(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl %ebx, 4(%ecx) +; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -722,17 +722,17 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: shrdl $28, %edi, %ecx ; X86-NEXT: sarl $28, %edi -; X86-NEXT: cmpl %ecx, %edx -; X86-NEXT: movl %esi, %ebx +; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: sbbl %edi, %ebx -; X86-NEXT: cmovll %edx, %ecx -; X86-NEXT: cmovll %esi, %edi +; X86-NEXT: cmovll %esi, %ecx +; X86-NEXT: cmovll %edx, %edi ; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: sarl $31, %edi ; X86-NEXT: movl %ecx, (%eax) diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll index fbdb6e703fefd..0e17af441d649 100644 --- a/llvm/test/CodeGen/X86/smul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll @@ -196,110 +196,81 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: andl $1, %ebp +; X86-NEXT: negl %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: andl $1, %eax -; X86-NEXT: negl %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: mull %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %edx, %ecx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %edx, %edi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: setb %bl ; X86-NEXT: addl %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X86-NEXT: movzbl %bl, %esi ; X86-NEXT: adcl %edx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: mull %ebp +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %edx, %ebp +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %eax, %ebp +; X86-NEXT: addl %eax, %ebx ; X86-NEXT: adcl %edx, %edi -; X86-NEXT: setb %bl +; X86-NEXT: setb %cl ; X86-NEXT: addl %eax, %edi -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %edx, %eax -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl %ebp, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %esi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %eax, %edx ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl (%esp), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: setb %al -; X86-NEXT: addl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebp, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: adcl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -308,103 +279,127 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: adcl %ecx, %edi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp +; X86-NEXT: mull %ebp +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl (%esp), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: setb (%esp) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %ebx, %edi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl %ebp, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: addl %edi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edi ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -414,248 +409,248 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %esi, %ecx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %eax, %esi +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb %al -; X86-NEXT: addl %ebp, %esi +; X86-NEXT: addl %ebp, %edi ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: adcl %edx, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %ebp, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %ebp, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: adcl %eax, %esi ; X86-NEXT: setb %al -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %al, %ebp -; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl %ebp, %eax -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %edi -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movl %edi, %ebx ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: adcl %ebx, %ebp -; X86-NEXT: setb %cl -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movzbl %cl, %ebx -; X86-NEXT: adcl %esi, %ebx ; X86-NEXT: movl (%esp), %ebp # 4-byte Reload ; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: adcl $0, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movzbl %bl, %ebx +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl $0, %edi -; X86-NEXT: adcl $0, %eax -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: adcl $0, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edx, %esi +; X86-NEXT: addl %edx, %ebp ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %eax, %ebp ; X86-NEXT: adcl %edx, %ecx ; X86-NEXT: setb %bl ; X86-NEXT: addl %eax, %ecx -; X86-NEXT: movzbl %bl, %edi -; X86-NEXT: adcl %edx, %edi +; X86-NEXT: movzbl %bl, %esi +; X86-NEXT: adcl %edx, %esi ; X86-NEXT: movl %eax, %edx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: adcl %edi, %eax +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, %esi ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %eax -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %ebp, %edx +; X86-NEXT: addl %edi, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl %eax, %esi -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: setb %al ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: addl %edi, %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movzbl %al, %eax +; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: movl %eax, %edx +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: adcl $0, %eax +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ebp, %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl (%esp), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %edx +; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: movl %edi, %edx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: setb %cl -; X86-NEXT: addl %esi, %edx -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: setb %al +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: adcl %esi, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: setb %cl -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl (%esp), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: setb %al +; X86-NEXT: addl %ebp, %edi +; X86-NEXT: movzbl %al, %esi +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: adcl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: adcl $0, %eax +; X86-NEXT: movl (%esp), %ebp # 4-byte Reload +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %esi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: setb %bl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: addl %eax, %esi -; X86-NEXT: adcl %ebp, %ecx -; X86-NEXT: movzbl %bl, %edi -; X86-NEXT: adcl %edx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %eax, %edi +; X86-NEXT: adcl %ebp, %esi +; X86-NEXT: setb %al +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl %edx, %esi +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: adcl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: addl %eax, %eax -; X86-NEXT: adcl %edx, %ebx +; X86-NEXT: adcl %edx, %ebp ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: adcl %ebx, %edx -; X86-NEXT: addl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -663,113 +658,111 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: addl %edx, %eax ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl %esi, %edx ; X86-NEXT: setb %al -; X86-NEXT: addl %esi, %edx -; X86-NEXT: movzbl %al, %edi -; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: addl %edi, %edx +; X86-NEXT: movzbl %al, %ebp +; X86-NEXT: adcl %esi, %ebp ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl %edx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: adcl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: adcl %ebp, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: addl %esi, %eax -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: addl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: adcl %ebp, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: addl %edx, %esi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload -; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %eax, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl %esi, %edx -; X86-NEXT: movl %esi, %ebx ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: setb %bl +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movzbl %bl, %ebx +; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl %edi, %esi +; X86-NEXT: adcl %eax, (%esp) # 4-byte Folded Spill +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: adcl %ebp, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: addl %ebp, %edx -; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: addl %esi, %edx +; X86-NEXT: adcl %eax, %edi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %ebx, %eax +; X86-NEXT: adcl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: adcl %ebp, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl %esi, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: adcl %esi, %ebp -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edx, %esi +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: movl (%esp), %ebx # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, %edi ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %esi +; X86-NEXT: xorl %edx, %ebp ; X86-NEXT: xorl %edx, %eax -; X86-NEXT: orl %esi, %eax +; X86-NEXT: orl %ebp, %eax ; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: xorl %edx, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -778,13 +771,12 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl (%esp), %ebx # 4-byte Reload ; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: xorl %edx, %ebp -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: movl %edi, %esi ; X86-NEXT: xorl %edx, %esi +; X86-NEXT: orl %ebx, %esi +; X86-NEXT: xorl %edx, %edi ; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl %edi, %edx ; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl %ebp, %edx ; X86-NEXT: orl %ecx, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: movl %edi, %ecx @@ -828,170 +820,169 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %r8, %r13 -; X64-NEXT: movq %rcx, %r10 -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: movq %r9, %r15 +; X64-NEXT: movq %rcx, %r9 +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rsi, %r12 ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; X64-NEXT: andl $1, %r11d ; X64-NEXT: negq %r11 -; X64-NEXT: andl $1, %r10d -; X64-NEXT: negq %r10 -; X64-NEXT: movq %r10, %rax +; X64-NEXT: andl $1, %r9d +; X64-NEXT: negq %r9 +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: movq %rax, %r12 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: addq %rdx, %rdi -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: addq %rdx, %rbp +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: addq %rax, %rdi -; X64-NEXT: adcq %rdx, %r8 -; X64-NEXT: setb %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: addq %rax, %r8 +; X64-NEXT: addq %rax, %rbp ; X64-NEXT: adcq %rdx, %rcx -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: setb %sil +; X64-NEXT: movzbl %sil, %edi +; X64-NEXT: addq %rax, %rcx +; X64-NEXT: adcq %rdx, %rdi +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r14, %rbp -; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %r10, %r13 +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: addq %rbp, %rax +; X64-NEXT: addq %r13, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r13, %rsi -; X64-NEXT: setb %bpl -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: adcq %rbx, %rsi +; X64-NEXT: setb %r8b +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: addq %rsi, %rax -; X64-NEXT: movzbl %bpl, %edx -; X64-NEXT: adcq %rdx, %r14 -; X64-NEXT: addq %r12, %rax -; X64-NEXT: movq %r12, %r9 +; X64-NEXT: movzbl %r8b, %edx +; X64-NEXT: adcq %rdx, %rbx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: adcq %rdi, %r14 -; X64-NEXT: adcq $0, %r8 +; X64-NEXT: adcq %rbp, %rbx ; X64-NEXT: adcq $0, %rcx +; X64-NEXT: adcq $0, %rdi ; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq %rax, %r8 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: addq %r13, %r15 +; X64-NEXT: addq %r13, %r14 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: addq %rbx, %r15 +; X64-NEXT: addq %r15, %r14 ; X64-NEXT: adcq %r13, %rbp ; X64-NEXT: setb %al -; X64-NEXT: addq %rdi, %rbp +; X64-NEXT: addq %r8, %rbp ; X64-NEXT: movzbl %al, %r12d ; X64-NEXT: adcq %rdx, %r12 -; X64-NEXT: addq %rbx, %rsi +; X64-NEXT: addq %r15, %rsi ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r14, %r15 +; X64-NEXT: adcq %rbx, %r14 ; X64-NEXT: adcq $0, %rbp ; X64-NEXT: adcq $0, %r12 -; X64-NEXT: addq %r8, %rbp -; X64-NEXT: adcq %rcx, %r12 +; X64-NEXT: addq %rcx, %rbp +; X64-NEXT: adcq %rdi, %r12 ; X64-NEXT: setb %cl -; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rdx, %r8 -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: adcq $0, %r14 -; X64-NEXT: addq %rax, %r8 -; X64-NEXT: adcq %rdx, %r14 -; X64-NEXT: setb %dil -; X64-NEXT: addq %rax, %r14 -; X64-NEXT: movzbl %dil, %esi +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rdx, %r10 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: addq %rax, %r10 +; X64-NEXT: adcq %rdx, %rdi +; X64-NEXT: setb %bl +; X64-NEXT: addq %rax, %rdi +; X64-NEXT: movzbl %bl, %esi ; X64-NEXT: adcq %rdx, %rsi ; X64-NEXT: addq %rax, %rbp -; X64-NEXT: adcq %r12, %r8 +; X64-NEXT: adcq %r12, %r10 ; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: adcq %rax, %r14 +; X64-NEXT: adcq %rax, %rdi ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq %rsi, %rdi +; X64-NEXT: movq %rsi, %r8 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: addq %rax, %rdi +; X64-NEXT: addq %rax, %r8 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: addq %r9, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: addq %rbx, %r8 ; X64-NEXT: adcq %rax, %rcx ; X64-NEXT: setb %al ; X64-NEXT: addq %rsi, %rcx ; X64-NEXT: movzbl %al, %esi ; X64-NEXT: adcq %rdx, %rsi -; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r9, %rax ; X64-NEXT: imulq %r11 -; X64-NEXT: movq %r9, %r11 +; X64-NEXT: movq %rbx, %r11 ; X64-NEXT: addq %rax, %r11 -; X64-NEXT: movq %rdi, %r12 +; X64-NEXT: movq %r8, %r12 ; X64-NEXT: adcq %rdx, %r12 ; X64-NEXT: addq %rcx, %r11 ; X64-NEXT: adcq %rsi, %r12 -; X64-NEXT: movq %rbx, %r10 -; X64-NEXT: addq %r13, %r10 +; X64-NEXT: movq %r15, %r9 +; X64-NEXT: addq %r13, %r9 ; X64-NEXT: adcq $0, %r13 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: addq %rcx, %r10 +; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: adcq %rsi, %r13 -; X64-NEXT: setb %r9b +; X64-NEXT: setb %bl ; X64-NEXT: addq %rcx, %r13 -; X64-NEXT: movzbl %r9b, %ecx +; X64-NEXT: movzbl %bl, %ecx ; X64-NEXT: adcq %rsi, %rcx -; X64-NEXT: addq %rbx, %rax -; X64-NEXT: adcq %r10, %rdx +; X64-NEXT: addq %r15, %rax +; X64-NEXT: adcq %r9, %rdx ; X64-NEXT: addq %r13, %rax ; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; X64-NEXT: adcq %rdi, %r10 +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; X64-NEXT: adcq %r8, %r9 ; X64-NEXT: adcq %r11, %rax ; X64-NEXT: adcq %r12, %rdx -; X64-NEXT: addq %rbp, %rbx -; X64-NEXT: adcq %r8, %r10 -; X64-NEXT: adcq %r14, %rax +; X64-NEXT: addq %rbp, %r15 +; X64-NEXT: adcq %r10, %r9 +; X64-NEXT: adcq %rdi, %rax ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %r14, %rcx ; X64-NEXT: sarq $63, %rcx ; X64-NEXT: xorq %rcx, %rdx -; X64-NEXT: xorq %rcx, %r10 -; X64-NEXT: orq %rdx, %r10 +; X64-NEXT: xorq %rcx, %r9 +; X64-NEXT: orq %rdx, %r9 ; X64-NEXT: xorq %rcx, %rax -; X64-NEXT: xorq %rbx, %rcx +; X64-NEXT: xorq %r15, %rcx ; X64-NEXT: orq %rax, %rcx -; X64-NEXT: orq %r10, %rcx +; X64-NEXT: orq %r9, %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: movl %eax, %esi ; X64-NEXT: andl $1, %esi ; X64-NEXT: movq %rsi, %rdx ; X64-NEXT: negq %rdx -; X64-NEXT: xorq %rdx, %r15 +; X64-NEXT: xorq %rdx, %r14 ; X64-NEXT: xorq %rax, %rdx -; X64-NEXT: orq %r15, %rdx +; X64-NEXT: orq %r14, %rdx ; X64-NEXT: orq %rcx, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll index 8c2b945d6a8ce..ce56283df6010 100644 --- a/llvm/test/CodeGen/X86/smul_fix.ll +++ b/llvm/test/CodeGen/X86/smul_fix.ll @@ -56,28 +56,28 @@ define i64 @func2(i64 %x, i64 %y) { ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ecx -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: adcl %edi, %edx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: imull %ebp, %edi +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: imull %ebx, %edi ; X86-NEXT: addl %edi, %edx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: subl %ecx, %edi -; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: testl %ebp, %ebp ; X86-NEXT: cmovsl %edi, %edx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovsl %ecx, %edx ; X86-NEXT: shldl $30, %eax, %edx ; X86-NEXT: shldl $30, %esi, %eax diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll index 75fc1d34fa1cb..85c966c447fad 100644 --- a/llvm/test/CodeGen/X86/smul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll @@ -60,61 +60,61 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %edi, %eax -; X86-NEXT: imull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %esi, %edx -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %esi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: subl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %esi, %ebp ; X86-NEXT: sbbl $0, %ebp -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %ebx, %ebp -; X86-NEXT: cmovnsl %edx, %esi -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: cmovnsl %esi, %ebp +; X86-NEXT: cmovnsl %edx, %edi +; X86-NEXT: movl %edi, %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ebp, %edx ; X86-NEXT: sbbl $0, %edx ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-NEXT: cmovnsl %ebp, %edx -; X86-NEXT: cmovnsl %esi, %ecx +; X86-NEXT: cmovnsl %edi, %ecx ; X86-NEXT: testl %edx, %edx -; X86-NEXT: setg %bl -; X86-NEXT: sete %bh +; X86-NEXT: setg %ah +; X86-NEXT: sete (%esp) # 1-byte Folded Spill ; X86-NEXT: cmpl $2, %ecx ; X86-NEXT: setae %al -; X86-NEXT: andb %bh, %al -; X86-NEXT: orb %bl, %al -; X86-NEXT: movl (%esp), %ebx # 4-byte Reload -; X86-NEXT: shrdl $2, %edi, %ebx -; X86-NEXT: shrdl $2, %ecx, %edi +; X86-NEXT: andb (%esp), %al # 1-byte Folded Reload +; X86-NEXT: orb %ah, %al +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: shrdl $2, %ebx, %ebp +; X86-NEXT: shrdl $2, %ecx, %ebx ; X86-NEXT: testb %al, %al ; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF -; X86-NEXT: cmovel %edi, %esi +; X86-NEXT: cmovel %ebx, %esi ; X86-NEXT: movl $-1, %edi -; X86-NEXT: cmovel %ebx, %edi +; X86-NEXT: cmovel %ebp, %edi ; X86-NEXT: cmpl $-1, %edx ; X86-NEXT: setl %dl ; X86-NEXT: sete %al @@ -126,7 +126,7 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; X86-NEXT: cmovel %edi, %eax ; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000 ; X86-NEXT: cmovel %esi, %edx -; X86-NEXT: addl $4, %esp +; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -376,11 +376,12 @@ define i64 @func5(i64 %x, i64 %y) { ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: movl %esi, %ebp ; X86-NEXT: imull %edx, %ebx ; X86-NEXT: mull %edx ; X86-NEXT: movl %edx, %esi @@ -398,44 +399,45 @@ define i64 @func5(i64 %x, i64 %y) { ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %eax, %ebp +; X86-NEXT: addl %eax, %esi ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %eax, %ebp -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %eax, %esi +; X86-NEXT: adcl %edi, %ebp ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X86-NEXT: adcl %esi, %edx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload +; X86-NEXT: adcl %edi, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl %ebx, %edx -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: sarl $31, %edi -; X86-NEXT: xorl %edi, %edx -; X86-NEXT: xorl %eax, %edi +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: xorl %ebx, %edx +; X86-NEXT: xorl %eax, %ebx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: xorl $2147483647, %esi # imm = 0x7FFFFFFF -; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: xorl $2147483647, %edi # imm = 0x7FFFFFFF +; X86-NEXT: orl %edx, %ebx ; X86-NEXT: notl %ecx ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: cmovel %ebp, %esi +; X86-NEXT: cmovel %esi, %edi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %edi, %edx ; X86-NEXT: addl $12, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi @@ -627,32 +629,31 @@ define i64 @func7(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %edx, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebp, %eax -; X86-NEXT: adcl %ebx, %edx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: subl %esi, %ecx -; X86-NEXT: movl %edi, %esi +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebx, %esi ; X86-NEXT: sbbl $0, %esi ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %edi, %esi +; X86-NEXT: cmovnsl %ebx, %esi ; X86-NEXT: cmovsl %ecx, %edx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: subl {{[0-9]+}}(%esp), %edi diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll index 3c4bb043800b7..abab313f4b12e 100644 --- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll @@ -89,183 +89,180 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, %edi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %edi +; X86-NEXT: adcl %ecx, %ebp ; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebp, %edi ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl %esi, %ebx ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl %ebx, %ecx ; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl %ebx, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %eax, %edi +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: imull {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %edi, %edx +; X86-NEXT: addl %esi, %edx ; X86-NEXT: addl %eax, %edx ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %esi +; X86-NEXT: addl %eax, %edi ; X86-NEXT: adcl %edx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sarl $31, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %edx, %ebp -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %eax, %ebp -; X86-NEXT: adcl %edx, %edi +; X86-NEXT: adcl %edx, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: imull %ecx, %eax @@ -277,18 +274,18 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl %ecx, %edi +; X86-NEXT: addl %ecx, %esi ; X86-NEXT: adcl %edx, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl %esi, %edi +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; X86-NEXT: movl %edx, %ecx @@ -296,9 +293,9 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: xorl %ecx, %ebp ; X86-NEXT: orl %eax, %ebp -; X86-NEXT: xorl %ecx, %edi +; X86-NEXT: xorl %ecx, %esi ; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: orl %edi, %ecx +; X86-NEXT: orl %esi, %ecx ; X86-NEXT: orl %ebp, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edx, 12(%eax) @@ -344,183 +341,184 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X64-NEXT: .cfi_offset %r15, -24 ; X64-NEXT: .cfi_offset %rbp, -16 ; X64-NEXT: movq %rcx, %r13 -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rsi, %r11 +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rsi, %r10 ; X64-NEXT: movq %rdx, %rax ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r15 +; X64-NEXT: movq %rax, %r11 ; X64-NEXT: movq %r13, %rax ; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rcx, %r14 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r9, %rcx ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rbx, %r14 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r14, %rbx ; X64-NEXT: adcq %rsi, %r12 ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %r10d +; X64-NEXT: movzbl %al, %r9d ; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rcx, %r14 +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r12, %rsi -; X64-NEXT: adcq %r10, %rcx +; X64-NEXT: adcq %r9, %rcx ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rbx, %r13 +; X64-NEXT: addq %r9, %r13 ; X64-NEXT: adcq $0, %r12 ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rdi, %r15 +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: addq %r13, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r12, %rbx -; X64-NEXT: setb %r10b -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: adcq %r12, %r9 +; X64-NEXT: setb %dil +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rbx, %rbp -; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: addq %r9, %rbp +; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: adcq %rax, %r13 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; X64-NEXT: addq %r15, %rbp -; X64-NEXT: adcq %r14, %r13 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; X64-NEXT: addq %r11, %rbp +; X64-NEXT: adcq %rbx, %r13 ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %r14 -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: movq %r11, %rbx -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r10, %r14 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rdi, %r10 -; X64-NEXT: adcq $0, %r9 +; X64-NEXT: addq %r9, %r10 +; X64-NEXT: adcq $0, %r11 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r10, %r14 -; X64-NEXT: adcq %r9, %r11 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r10, %r15 +; X64-NEXT: adcq %r11, %rdi ; X64-NEXT: setb %r10b -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %r11, %r9 +; X64-NEXT: addq %rdi, %r9 ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: addq %rbp, %r15 +; X64-NEXT: adcq %rax, %r11 +; X64-NEXT: addq %rbp, %rbx +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %r13, %r15 ; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r13, %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: adcq $0, %rdi +; X64-NEXT: adcq $0, %r11 ; X64-NEXT: addq %rsi, %r9 -; X64-NEXT: adcq %rcx, %rdi +; X64-NEXT: adcq %rcx, %r11 ; X64-NEXT: setb %bl -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 ## 8-byte Reload -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload +; X64-NEXT: movq %r10, %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp ## 8-byte Reload +; X64-NEXT: movq %rbp, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rcx, %r10 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rcx, %rdi ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: addq %r10, %rax -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: adcq %rsi, %r11 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: addq %rdi, %rax +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: adcq %rsi, %r10 ; X64-NEXT: setb %cl -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rbp, %rax ; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r11, %r15 +; X64-NEXT: addq %r10, %r15 ; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: adcq %rax, %r13 -; X64-NEXT: addq %r9, %rbp -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %rdi, %r10 -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: addq %r9, %r14 +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %r11, %rdi +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: movzbl %bl, %eax ; X64-NEXT: adcq %rax, %r15 ; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq %r14, %rdi -; X64-NEXT: movq %r14, %rbp +; X64-NEXT: movq %rbp, %rdi ; X64-NEXT: sarq $63, %rdi ; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload ; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r14 ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %r9, %r8 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: addq %rcx, %r8 -; X64-NEXT: movq %rcx, %rbx -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r9, %r10 -; X64-NEXT: setb %cl -; X64-NEXT: movq %rdi, %rsi -; X64-NEXT: imulq %r12, %rsi +; X64-NEXT: addq %r10, %r11 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: addq %rsi, %r11 +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %r10, %r9 +; X64-NEXT: setb %sil +; X64-NEXT: movq %rdi, %r8 +; X64-NEXT: imulq %r12, %r8 ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq {{[0-9]+}}(%rsp) -; X64-NEXT: addq %rsi, %rdx +; X64-NEXT: addq %r8, %rdx ; X64-NEXT: addq %rax, %rdx ; X64-NEXT: addq %rbx, %rax -; X64-NEXT: adcq %r8, %rdx -; X64-NEXT: addq %r11, %r10 -; X64-NEXT: movzbl %cl, %esi -; X64-NEXT: adcq %r14, %rsi -; X64-NEXT: addq %rax, %r10 +; X64-NEXT: adcq %r11, %rdx +; X64-NEXT: addq %r14, %r9 +; X64-NEXT: movzbl %sil, %esi +; X64-NEXT: adcq %rcx, %rsi +; X64-NEXT: addq %rax, %r9 ; X64-NEXT: adcq %rdx, %rsi ; X64-NEXT: sarq $63, %r12 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload ; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: addq %rdx, %r14 -; X64-NEXT: adcq $0, %r11 +; X64-NEXT: adcq $0, %rdi ; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Reload -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %r10 ; X64-NEXT: addq %rax, %r14 -; X64-NEXT: adcq %rdx, %r11 +; X64-NEXT: adcq %rdx, %rdi ; X64-NEXT: setb %bl ; X64-NEXT: imulq %r12, %rbp ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload @@ -529,28 +527,28 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X64-NEXT: addq %rbp, %rdx ; X64-NEXT: addq %rcx, %rax ; X64-NEXT: adcq %r14, %rdx -; X64-NEXT: addq %r9, %r11 -; X64-NEXT: movzbl %bl, %r9d -; X64-NEXT: adcq %rdi, %r9 -; X64-NEXT: addq %rax, %r11 -; X64-NEXT: adcq %rdx, %r9 +; X64-NEXT: addq %r10, %rdi +; X64-NEXT: movzbl %bl, %r10d +; X64-NEXT: adcq %r8, %r10 +; X64-NEXT: addq %rax, %rdi +; X64-NEXT: adcq %rdx, %r10 ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload -; X64-NEXT: adcq %r8, %r14 -; X64-NEXT: adcq %r10, %r11 -; X64-NEXT: adcq %rsi, %r9 +; X64-NEXT: adcq %r11, %r14 +; X64-NEXT: adcq %r9, %rdi +; X64-NEXT: adcq %rsi, %r10 ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload -; X64-NEXT: adcq %r15, %r11 -; X64-NEXT: adcq %r13, %r9 +; X64-NEXT: adcq %r15, %rdi +; X64-NEXT: adcq %r13, %r10 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload ; X64-NEXT: movq %rdx, %rax ; X64-NEXT: sarq $63, %rax -; X64-NEXT: xorq %rax, %r9 +; X64-NEXT: xorq %rax, %r10 ; X64-NEXT: xorq %rax, %r14 -; X64-NEXT: orq %r9, %r14 -; X64-NEXT: xorq %rax, %r11 +; X64-NEXT: orq %r10, %r14 +; X64-NEXT: xorq %rax, %rdi ; X64-NEXT: xorq %rcx, %rax -; X64-NEXT: orq %r11, %rax +; X64-NEXT: orq %rdi, %rax ; X64-NEXT: orq %r14, %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax ; X64-NEXT: movq %rdx, 24(%rax) @@ -585,548 +583,532 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %edi +; X86-NEXT: adcl %ecx, %ebp ; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebp, %esi ; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %ebp, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi +; X86-NEXT: addl %ebx, %edi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ebp -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ecx, (%esp) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: setb (%esp) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl %ebp, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: mull %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %edi -; X86-NEXT: setb %bl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebp, %esi +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %ebp, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %ebx, %ebp ; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %edi ## 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edi +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 1-byte Folded Reload +; X86-NEXT: adcl %edi, %eax +; X86-NEXT: adcl $0, %esi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, (%esp) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %esi, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: setb %cl -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %ecx -; X86-NEXT: setb %bl -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %ebp, %ebx +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebp -; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl %ebp, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: movl (%esp), %edi ## 4-byte Reload +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: adcl $0, %eax -; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl %edi, (%esp) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %esi, %edi +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %edi -; X86-NEXT: setb %bl -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ebx, %ebp ; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl (%esp), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl $0, %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb (%esp) ## 1-byte Folded Spill +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl %ebp, %ebx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %esi ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload ; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill @@ -1135,8 +1117,8 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx @@ -1147,11 +1129,10 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %ecx, %eax @@ -1161,37 +1142,36 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: setb %al -; X86-NEXT: addl %esi, %ebx +; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: adcl %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %edi, (%esp) ## 4-byte Spill -; X86-NEXT: addl %edi, %edx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %ebp, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %eax, %edi ; X86-NEXT: setb %al -; X86-NEXT: addl %esi, %edi +; X86-NEXT: addl %ebp, %edi ; X86-NEXT: movzbl %al, %edx -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload ; X86-NEXT: addl %edi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: adcl %edx, %eax @@ -1208,7 +1188,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: addl %ebp, %edi ; X86-NEXT: adcl %esi, %edx ; X86-NEXT: setb %al -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %al, %eax @@ -1218,250 +1198,248 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl %ecx, %esi ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %eax, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edx, %esi +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edx, %ebp ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %ebx, %ecx +; X86-NEXT: imull %esi, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx +; X86-NEXT: mull %esi ; X86-NEXT: addl %eax, %edx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %esi -; X86-NEXT: adcl %edx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: addl %eax, %edx ; X86-NEXT: adcl $0, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl %eax, %edx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: adcl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl (%esp), %edx ## 4-byte Reload +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: adcl %ebx, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl %ebx, %edx -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: addl %esi, %edx +; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl %ebp, %edx ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: adcl %ebp, %esi ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edx, %esi -; X86-NEXT: setb %bl -; X86-NEXT: addl %eax, %esi -; X86-NEXT: movzbl %bl, %ebx ; X86-NEXT: adcl %edx, %ebx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: addl %esi, %eax +; X86-NEXT: setb (%esp) ## 1-byte Folded Spill +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: movzbl (%esp), %ebp ## 1-byte Folded Reload +; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: movl %edi, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: adcl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl %ebp, %eax +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %edx, %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %eax, %ebx -; X86-NEXT: adcl %edx, %edi -; X86-NEXT: setb %cl ; X86-NEXT: addl %eax, %edi +; X86-NEXT: adcl %edx, %esi +; X86-NEXT: setb %cl +; X86-NEXT: addl %eax, %esi ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %edx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: adcl %edi, (%esp) ## 4-byte Folded Spill +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: adcl $0, %eax -; X86-NEXT: addl %esi, %ecx +; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: setb %al ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: adcl %edi, %eax +; X86-NEXT: adcl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %esi, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl %eax, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: addl %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: setb %al +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: addl %edx, %eax +; X86-NEXT: adcl %edi, %ebp ; X86-NEXT: addl %esi, %ebx -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl %edi, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 1-byte Folded Reload +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: addl %eax, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl %esi, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl %esi, %edi +; X86-NEXT: adcl %ebx, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: imull %ebp, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: imull %ebp, %ebx ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %edx +; X86-NEXT: addl %ebx, %edx ; X86-NEXT: addl %eax, %edx -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: adcl %ebx, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %edi -; X86-NEXT: adcl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, %ecx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 1-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: adcl %edx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: addl %eax, %esi ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: sarl $31, %eax ; X86-NEXT: xorl %eax, %edx -; X86-NEXT: xorl %eax, %edi -; X86-NEXT: orl %edx, %edi ; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: xorl %eax, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; X86-NEXT: xorl %eax, %edx +; X86-NEXT: orl %esi, %edx ; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload ; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: xorl %eax, %esi -; X86-NEXT: orl %ecx, %esi ; X86-NEXT: xorl %eax, %ebx +; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: xorl %eax, %edi ; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: orl %edi, %eax ; X86-NEXT: orl %ebx, %eax -; X86-NEXT: orl %esi, %eax ; X86-NEXT: orl %edx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %ebp, 28(%eax) diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll b/llvm/test/CodeGen/X86/srem-vector-lkk.ll index c07c20d8f414a..c8c1026bdaf3f 100644 --- a/llvm/test/CodeGen/X86/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll @@ -175,6 +175,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { ; SSE-LABEL: dont_fold_srem_power_of_two: ; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pextrw $1, %xmm0, %eax ; SSE-NEXT: leal 31(%rax), %ecx ; SSE-NEXT: testw %ax, %ax @@ -187,16 +188,16 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { ; SSE-NEXT: cmovnsl %ecx, %edx ; SSE-NEXT: andl $-64, %edx ; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: movd %ecx, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: pinsrw $1, %eax, %xmm0 +; SSE-NEXT: pextrw $2, %xmm1, %eax ; SSE-NEXT: leal 7(%rax), %ecx ; SSE-NEXT: testw %ax, %ax ; SSE-NEXT: cmovnsl %eax, %ecx ; SSE-NEXT: andl $-8, %ecx ; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $2, %eax, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: pinsrw $2, %eax, %xmm0 +; SSE-NEXT: pextrw $3, %xmm1, %eax ; SSE-NEXT: movswl %ax, %ecx ; SSE-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77 ; SSE-NEXT: shrl $16, %ecx @@ -208,8 +209,7 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { ; SSE-NEXT: addl %ecx, %edx ; SSE-NEXT: imull $95, %edx, %ecx ; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pinsrw $3, %eax, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: dont_fold_srem_power_of_two: @@ -257,32 +257,32 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; SSE-LABEL: dont_fold_srem_one: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx +; SSE-NEXT: pextrw $2, %xmm0, %ecx +; SSE-NEXT: movswl %cx, %eax +; SSE-NEXT: imull $-19945, %eax, %eax # imm = 0xB217 +; SSE-NEXT: shrl $16, %eax +; SSE-NEXT: addl %ecx, %eax +; SSE-NEXT: movzwl %ax, %edx +; SSE-NEXT: movswl %dx, %eax +; SSE-NEXT: shrl $15, %edx +; SSE-NEXT: sarl $4, %eax +; SSE-NEXT: addl %edx, %eax +; SSE-NEXT: leal (%rax,%rax,2), %edx +; SSE-NEXT: shll $3, %edx +; SSE-NEXT: subl %edx, %eax +; SSE-NEXT: addl %ecx, %eax +; SSE-NEXT: pextrw $1, %xmm0, %ecx ; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $4, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: leal (%rdx,%rdx,2), %ecx -; SSE-NEXT: shll $3, %ecx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: addl %eax, %edx -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B -; SSE-NEXT: movl %ecx, %esi +; SSE-NEXT: imull $12827, %edx, %edx # imm = 0x321B +; SSE-NEXT: movl %edx, %esi ; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $23, %ecx -; SSE-NEXT: addl %esi, %ecx -; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: sarl $23, %edx +; SSE-NEXT: addl %esi, %edx +; SSE-NEXT: imull $654, %edx, %edx # imm = 0x28E +; SSE-NEXT: subl %edx, %ecx ; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pinsrw $2, %edx, %xmm1 +; SSE-NEXT: pinsrw $1, %ecx, %xmm1 +; SSE-NEXT: pinsrw $2, %eax, %xmm1 ; SSE-NEXT: pextrw $3, %xmm0, %eax ; SSE-NEXT: movswl %ax, %ecx ; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 diff --git a/llvm/test/CodeGen/X86/sse-regcall.ll b/llvm/test/CodeGen/X86/sse-regcall.ll index 0226052402cb8..6f0293392eef2 100644 --- a/llvm/test/CodeGen/X86/sse-regcall.ll +++ b/llvm/test/CodeGen/X86/sse-regcall.ll @@ -77,35 +77,35 @@ define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b, ; WIN32-NEXT: movaps %xmm7, (%esp) # 16-byte Spill ; WIN32-NEXT: movaps %xmm6, %xmm7 ; WIN32-NEXT: movaps %xmm5, %xmm6 -; WIN32-NEXT: movaps %xmm3, %xmm5 -; WIN32-NEXT: movaps %xmm2, %xmm3 -; WIN32-NEXT: movaps %xmm1, %xmm2 +; WIN32-NEXT: movaps %xmm4, %xmm5 +; WIN32-NEXT: movaps %xmm1, %xmm4 ; WIN32-NEXT: movaps %xmm0, %xmm1 -; WIN32-NEXT: addps %xmm4, %xmm0 -; WIN32-NEXT: mulps %xmm4, %xmm1 +; WIN32-NEXT: addps %xmm5, %xmm0 +; WIN32-NEXT: mulps %xmm5, %xmm1 ; WIN32-NEXT: subps %xmm1, %xmm0 ; WIN32-NEXT: movups 8(%ebp), %xmm1 ; WIN32-NEXT: addps %xmm1, %xmm0 +; WIN32-NEXT: movaps %xmm4, %xmm1 +; WIN32-NEXT: addps %xmm6, %xmm1 +; WIN32-NEXT: mulps %xmm6, %xmm4 +; WIN32-NEXT: subps %xmm4, %xmm1 +; WIN32-NEXT: movups 24(%ebp), %xmm4 +; WIN32-NEXT: addps %xmm4, %xmm1 ; WIN32-NEXT: movaps %xmm2, %xmm4 -; WIN32-NEXT: addps %xmm6, %xmm4 -; WIN32-NEXT: mulps %xmm6, %xmm2 +; WIN32-NEXT: addps %xmm7, %xmm4 +; WIN32-NEXT: mulps %xmm7, %xmm2 ; WIN32-NEXT: subps %xmm2, %xmm4 -; WIN32-NEXT: movups 24(%ebp), %xmm1 -; WIN32-NEXT: addps %xmm1, %xmm4 -; WIN32-NEXT: movaps %xmm3, %xmm2 -; WIN32-NEXT: addps %xmm7, %xmm2 -; WIN32-NEXT: mulps %xmm7, %xmm3 -; WIN32-NEXT: subps %xmm3, %xmm2 -; WIN32-NEXT: movups 40(%ebp), %xmm1 -; WIN32-NEXT: addps %xmm1, %xmm2 +; WIN32-NEXT: movups 40(%ebp), %xmm2 +; WIN32-NEXT: addps %xmm2, %xmm4 +; WIN32-NEXT: movaps %xmm3, %xmm5 +; WIN32-NEXT: movaps (%esp), %xmm2 # 16-byte Reload +; WIN32-NEXT: addps %xmm2, %xmm5 +; WIN32-NEXT: mulps %xmm2, %xmm3 +; WIN32-NEXT: subps %xmm3, %xmm5 +; WIN32-NEXT: movups 56(%ebp), %xmm2 +; WIN32-NEXT: addps %xmm2, %xmm5 +; WIN32-NEXT: movaps %xmm4, %xmm2 ; WIN32-NEXT: movaps %xmm5, %xmm3 -; WIN32-NEXT: movaps (%esp), %xmm1 # 16-byte Reload -; WIN32-NEXT: addps %xmm1, %xmm3 -; WIN32-NEXT: mulps %xmm1, %xmm5 -; WIN32-NEXT: subps %xmm5, %xmm3 -; WIN32-NEXT: movups 56(%ebp), %xmm1 -; WIN32-NEXT: addps %xmm1, %xmm3 -; WIN32-NEXT: movaps %xmm4, %xmm1 ; WIN32-NEXT: movl %ebp, %esp ; WIN32-NEXT: popl %ebp ; WIN32-NEXT: retl @@ -198,44 +198,43 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: subl $12, %esp ; WIN32-NEXT: movl %esi, (%esp) # 4-byte Spill -; WIN32-NEXT: movl %edi, %esi -; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: leal (%edx,%esi), %eax +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: leal (%edx,%edi), %eax ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: subl %esi, %ebx -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: subl %ecx, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl %ebp, %ecx -; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: imull %eax, %ecx +; WIN32-NEXT: movl %edx, %eax +; WIN32-NEXT: subl %edi, %eax +; WIN32-NEXT: movl %ebp, %edx +; WIN32-NEXT: subl %ecx, %edx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: imull %edx, %ebx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: movl %esi, %edx +; WIN32-NEXT: subl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: imull %eax, %edx +; WIN32-NEXT: addl %ebx, %edx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl (%esp), %edi # 4-byte Reload +; WIN32-NEXT: subl %ebx, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl %ecx, %eax ; WIN32-NEXT: subl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull %ebx, %eax -; WIN32-NEXT: addl %ecx, %eax -; WIN32-NEXT: movl (%esp), %ebx # 4-byte Reload -; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: imull %edi, %eax +; WIN32-NEXT: addl %edx, %eax +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; WIN32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: imull %ebx, %ecx -; WIN32-NEXT: addl %eax, %ecx -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload -; WIN32-NEXT: addl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: imull %ebp, %edi +; WIN32-NEXT: addl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: imull %edx, %ebp ; WIN32-NEXT: addl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; WIN32-NEXT: addl %esi, %edi -; WIN32-NEXT: addl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: imull %eax, %edx -; WIN32-NEXT: addl %edx, %edi -; WIN32-NEXT: addl %ecx, %edi -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: addl %esi, %ebp +; WIN32-NEXT: addl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: imull %ebx, %ecx +; WIN32-NEXT: addl %ecx, %ebp +; WIN32-NEXT: addl %eax, %ebp +; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: addl $12, %esp ; WIN32-NEXT: popl %ebx ; WIN32-NEXT: popl %ebp @@ -243,6 +242,7 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; ; WIN64-LABEL: testi32_inp: ; WIN64: # %bb.0: +; WIN64-NEXT: pushq %rbp ; WIN64-NEXT: pushq %rbx ; WIN64-NEXT: # kill: def $edx killed $edx def $rdx ; WIN64-NEXT: # kill: def $esi killed $esi def $rsi @@ -255,35 +255,36 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8 ; WIN64-NEXT: # kill: def $edi killed $edi def $rdi ; WIN64-NEXT: leal (%rdx,%rdi), %ebx -; WIN64-NEXT: # kill: def $edx killed $edx killed $rdx -; WIN64-NEXT: subl %edi, %edx -; WIN64-NEXT: leal (%rsi,%r8), %edi +; WIN64-NEXT: movl %edx, %ebp +; WIN64-NEXT: subl %edi, %ebp +; WIN64-NEXT: leal (%rsi,%r8), %edx ; WIN64-NEXT: # kill: def $esi killed $esi killed $rsi ; WIN64-NEXT: subl %r8d, %esi -; WIN64-NEXT: leal (%r9,%r10), %r8d -; WIN64-NEXT: # kill: def $r9d killed $r9d killed $r9 -; WIN64-NEXT: subl %r10d, %r9d -; WIN64-NEXT: movl %eax, %r10d -; WIN64-NEXT: subl %ecx, %r10d -; WIN64-NEXT: imull %r10d, %r9d -; WIN64-NEXT: leal (%r11,%r12), %r10d -; WIN64-NEXT: # kill: def $r11d killed $r11d killed $r11 -; WIN64-NEXT: subl %r12d, %r11d -; WIN64-NEXT: imull %edx, %r11d -; WIN64-NEXT: addl %r9d, %r11d -; WIN64-NEXT: leal (%r14,%r15), %edx -; WIN64-NEXT: movl %r14d, %r9d -; WIN64-NEXT: subl %r15d, %r9d -; WIN64-NEXT: imull %esi, %r9d -; WIN64-NEXT: addl %r11d, %r9d +; WIN64-NEXT: leal (%r9,%r10), %edi +; WIN64-NEXT: movl %r9d, %r8d +; WIN64-NEXT: subl %r10d, %r8d +; WIN64-NEXT: movl %eax, %r9d +; WIN64-NEXT: subl %ecx, %r9d +; WIN64-NEXT: imull %r9d, %r8d +; WIN64-NEXT: leal (%r11,%r12), %r9d +; WIN64-NEXT: movl %r11d, %r10d +; WIN64-NEXT: subl %r12d, %r10d +; WIN64-NEXT: imull %ebp, %r10d +; WIN64-NEXT: addl %r8d, %r10d +; WIN64-NEXT: leal (%r14,%r15), %r8d +; WIN64-NEXT: movl %r14d, %r11d +; WIN64-NEXT: subl %r15d, %r11d +; WIN64-NEXT: imull %esi, %r11d +; WIN64-NEXT: addl %r10d, %r11d ; WIN64-NEXT: addl %ecx, %eax -; WIN64-NEXT: imull %r8d, %eax -; WIN64-NEXT: imull %ebx, %r10d -; WIN64-NEXT: addl %r10d, %eax -; WIN64-NEXT: imull %edi, %edx -; WIN64-NEXT: addl %edx, %eax +; WIN64-NEXT: imull %edi, %eax +; WIN64-NEXT: imull %ebx, %r9d ; WIN64-NEXT: addl %r9d, %eax +; WIN64-NEXT: imull %edx, %r8d +; WIN64-NEXT: addl %r8d, %eax +; WIN64-NEXT: addl %r11d, %eax ; WIN64-NEXT: popq %rbx +; WIN64-NEXT: popq %rbp ; WIN64-NEXT: retq ; ; LINUXOSX-LABEL: testi32_inp: @@ -297,35 +298,35 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; LINUXOSX-NEXT: # kill: def $r8d killed $r8d def $r8 ; LINUXOSX-NEXT: # kill: def $edi killed $edi def $rdi ; LINUXOSX-NEXT: leal (%rdx,%rdi), %r10d -; LINUXOSX-NEXT: # kill: def $edx killed $edx killed $rdx -; LINUXOSX-NEXT: subl %edi, %edx -; LINUXOSX-NEXT: leal (%rsi,%r8), %edi +; LINUXOSX-NEXT: movl %edx, %r11d +; LINUXOSX-NEXT: subl %edi, %r11d +; LINUXOSX-NEXT: leal (%rsi,%r8), %edx ; LINUXOSX-NEXT: # kill: def $esi killed $esi killed $rsi ; LINUXOSX-NEXT: subl %r8d, %esi -; LINUXOSX-NEXT: leal (%r9,%r12), %r8d -; LINUXOSX-NEXT: # kill: def $r9d killed $r9d killed $r9 -; LINUXOSX-NEXT: subl %r12d, %r9d -; LINUXOSX-NEXT: movl %eax, %r11d -; LINUXOSX-NEXT: subl %ecx, %r11d -; LINUXOSX-NEXT: imull %r11d, %r9d -; LINUXOSX-NEXT: leal (%r13,%r14), %r11d +; LINUXOSX-NEXT: leal (%r9,%r12), %edi +; LINUXOSX-NEXT: movl %r9d, %r8d +; LINUXOSX-NEXT: subl %r12d, %r8d +; LINUXOSX-NEXT: movl %eax, %r9d +; LINUXOSX-NEXT: subl %ecx, %r9d +; LINUXOSX-NEXT: imull %r9d, %r8d +; LINUXOSX-NEXT: leal (%r13,%r14), %r9d ; LINUXOSX-NEXT: movl %r13d, %r12d ; LINUXOSX-NEXT: subl %r14d, %r12d -; LINUXOSX-NEXT: imull %edx, %r12d -; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %edx -; LINUXOSX-NEXT: addl %r9d, %r12d -; LINUXOSX-NEXT: movl %r15d, %r9d -; LINUXOSX-NEXT: subl %edx, %r9d -; LINUXOSX-NEXT: imull %esi, %r9d -; LINUXOSX-NEXT: addl %r12d, %r9d +; LINUXOSX-NEXT: imull %r11d, %r12d +; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; LINUXOSX-NEXT: addl %r8d, %r12d +; LINUXOSX-NEXT: movl %r15d, %r8d +; LINUXOSX-NEXT: subl %r11d, %r8d +; LINUXOSX-NEXT: imull %esi, %r8d +; LINUXOSX-NEXT: addl %r12d, %r8d ; LINUXOSX-NEXT: addl %ecx, %eax -; LINUXOSX-NEXT: imull %r8d, %eax -; LINUXOSX-NEXT: imull %r10d, %r11d -; LINUXOSX-NEXT: addl %r11d, %eax -; LINUXOSX-NEXT: addl %r15d, %edx -; LINUXOSX-NEXT: imull %edi, %edx -; LINUXOSX-NEXT: addl %edx, %eax +; LINUXOSX-NEXT: imull %edi, %eax +; LINUXOSX-NEXT: imull %r10d, %r9d ; LINUXOSX-NEXT: addl %r9d, %eax +; LINUXOSX-NEXT: addl %r15d, %r11d +; LINUXOSX-NEXT: imull %edx, %r11d +; LINUXOSX-NEXT: addl %r11d, %eax +; LINUXOSX-NEXT: addl %r8d, %eax ; LINUXOSX-NEXT: retq i32 %b1, i32 %b2, i32 %b3, i32 %b4, i32 %b5, i32 %b6) nounwind { %x1 = sub i32 %a1, %a2 diff --git a/llvm/test/CodeGen/X86/sse-regcall4.ll b/llvm/test/CodeGen/X86/sse-regcall4.ll index 80eaf0f900066..c8df7a233d7e3 100644 --- a/llvm/test/CodeGen/X86/sse-regcall4.ll +++ b/llvm/test/CodeGen/X86/sse-regcall4.ll @@ -77,35 +77,35 @@ define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b, ; WIN32-NEXT: movaps %xmm7, (%esp) # 16-byte Spill ; WIN32-NEXT: movaps %xmm6, %xmm7 ; WIN32-NEXT: movaps %xmm5, %xmm6 -; WIN32-NEXT: movaps %xmm3, %xmm5 -; WIN32-NEXT: movaps %xmm2, %xmm3 -; WIN32-NEXT: movaps %xmm1, %xmm2 +; WIN32-NEXT: movaps %xmm4, %xmm5 +; WIN32-NEXT: movaps %xmm1, %xmm4 ; WIN32-NEXT: movaps %xmm0, %xmm1 -; WIN32-NEXT: addps %xmm4, %xmm0 -; WIN32-NEXT: mulps %xmm4, %xmm1 +; WIN32-NEXT: addps %xmm5, %xmm0 +; WIN32-NEXT: mulps %xmm5, %xmm1 ; WIN32-NEXT: subps %xmm1, %xmm0 ; WIN32-NEXT: movups 8(%ebp), %xmm1 ; WIN32-NEXT: addps %xmm1, %xmm0 +; WIN32-NEXT: movaps %xmm4, %xmm1 +; WIN32-NEXT: addps %xmm6, %xmm1 +; WIN32-NEXT: mulps %xmm6, %xmm4 +; WIN32-NEXT: subps %xmm4, %xmm1 +; WIN32-NEXT: movups 24(%ebp), %xmm4 +; WIN32-NEXT: addps %xmm4, %xmm1 ; WIN32-NEXT: movaps %xmm2, %xmm4 -; WIN32-NEXT: addps %xmm6, %xmm4 -; WIN32-NEXT: mulps %xmm6, %xmm2 +; WIN32-NEXT: addps %xmm7, %xmm4 +; WIN32-NEXT: mulps %xmm7, %xmm2 ; WIN32-NEXT: subps %xmm2, %xmm4 -; WIN32-NEXT: movups 24(%ebp), %xmm1 -; WIN32-NEXT: addps %xmm1, %xmm4 -; WIN32-NEXT: movaps %xmm3, %xmm2 -; WIN32-NEXT: addps %xmm7, %xmm2 -; WIN32-NEXT: mulps %xmm7, %xmm3 -; WIN32-NEXT: subps %xmm3, %xmm2 -; WIN32-NEXT: movups 40(%ebp), %xmm1 -; WIN32-NEXT: addps %xmm1, %xmm2 +; WIN32-NEXT: movups 40(%ebp), %xmm2 +; WIN32-NEXT: addps %xmm2, %xmm4 +; WIN32-NEXT: movaps %xmm3, %xmm5 +; WIN32-NEXT: movaps (%esp), %xmm2 # 16-byte Reload +; WIN32-NEXT: addps %xmm2, %xmm5 +; WIN32-NEXT: mulps %xmm2, %xmm3 +; WIN32-NEXT: subps %xmm3, %xmm5 +; WIN32-NEXT: movups 56(%ebp), %xmm2 +; WIN32-NEXT: addps %xmm2, %xmm5 +; WIN32-NEXT: movaps %xmm4, %xmm2 ; WIN32-NEXT: movaps %xmm5, %xmm3 -; WIN32-NEXT: movaps (%esp), %xmm1 # 16-byte Reload -; WIN32-NEXT: addps %xmm1, %xmm3 -; WIN32-NEXT: mulps %xmm1, %xmm5 -; WIN32-NEXT: subps %xmm5, %xmm3 -; WIN32-NEXT: movups 56(%ebp), %xmm1 -; WIN32-NEXT: addps %xmm1, %xmm3 -; WIN32-NEXT: movaps %xmm4, %xmm1 ; WIN32-NEXT: movl %ebp, %esp ; WIN32-NEXT: popl %ebp ; WIN32-NEXT: retl @@ -197,44 +197,43 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: subl $8, %esp -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: movl %edx, (%esp) # 4-byte Spill ; WIN32-NEXT: movl %ecx, %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: leal (%esi,%eax), %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: leal (%eax,%esi), %ecx ; WIN32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %esi, %ecx -; WIN32-NEXT: subl %eax, %ecx +; WIN32-NEXT: movl %eax, %ebx +; WIN32-NEXT: subl %esi, %ebx ; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: subl %edx, %eax -; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: imull %eax, %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: subl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull %ecx, %eax -; WIN32-NEXT: addl %ebx, %eax +; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: imull %eax, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: subl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: imull %ebx, %esi +; WIN32-NEXT: addl %ebp, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: movl %ebp, %ebx ; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: imull %ebx, %ecx -; WIN32-NEXT: addl %eax, %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: subl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: imull %ebx, %eax +; WIN32-NEXT: addl %esi, %eax ; WIN32-NEXT: addl (%esp), %edi # 4-byte Folded Reload ; WIN32-NEXT: addl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: addl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull %eax, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: addl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; WIN32-NEXT: addl %esi, %edi +; WIN32-NEXT: imull %esi, %edi ; WIN32-NEXT: addl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: imull %ebp, %edx +; WIN32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; WIN32-NEXT: addl %edx, %edi +; WIN32-NEXT: addl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: imull %ebp, %ecx ; WIN32-NEXT: addl %ecx, %edi +; WIN32-NEXT: addl %eax, %edi ; WIN32-NEXT: movl %edi, %ecx ; WIN32-NEXT: addl $8, %esp ; WIN32-NEXT: popl %ebx @@ -243,6 +242,7 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; ; WIN64-LABEL: testi32_inp: ; WIN64: # %bb.0: +; WIN64-NEXT: pushq %rbp ; WIN64-NEXT: pushq %rbx ; WIN64-NEXT: # kill: def $edx killed $edx def $rdx ; WIN64-NEXT: # kill: def $esi killed $esi def $rsi @@ -253,36 +253,37 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8 ; WIN64-NEXT: # kill: def $edi killed $edi def $rdi ; WIN64-NEXT: leal (%rdx,%rdi), %ebx -; WIN64-NEXT: # kill: def $edx killed $edx killed $rdx -; WIN64-NEXT: subl %edi, %edx -; WIN64-NEXT: leal (%rsi,%r8), %edi +; WIN64-NEXT: movl %edx, %ebp +; WIN64-NEXT: subl %edi, %ebp +; WIN64-NEXT: leal (%rsi,%r8), %edx ; WIN64-NEXT: # kill: def $esi killed $esi killed $rsi ; WIN64-NEXT: subl %r8d, %esi -; WIN64-NEXT: leal (%r9,%r11), %r8d -; WIN64-NEXT: # kill: def $r9d killed $r9d killed $r9 -; WIN64-NEXT: subl %r11d, %r9d -; WIN64-NEXT: movl %eax, %r11d -; WIN64-NEXT: subl %ecx, %r11d -; WIN64-NEXT: imull %r11d, %r9d -; WIN64-NEXT: leal (%r12,%r14), %r11d -; WIN64-NEXT: # kill: def $r12d killed $r12d killed $r12 -; WIN64-NEXT: subl %r14d, %r12d -; WIN64-NEXT: imull %edx, %r12d -; WIN64-NEXT: movl {{[0-9]+}}(%rsp), %edx -; WIN64-NEXT: addl %r9d, %r12d -; WIN64-NEXT: movl %r15d, %r9d -; WIN64-NEXT: subl %edx, %r9d -; WIN64-NEXT: imull %esi, %r9d -; WIN64-NEXT: addl %r12d, %r9d +; WIN64-NEXT: leal (%r9,%r11), %edi +; WIN64-NEXT: movl %r9d, %r8d +; WIN64-NEXT: subl %r11d, %r8d +; WIN64-NEXT: movl %eax, %r9d +; WIN64-NEXT: subl %ecx, %r9d +; WIN64-NEXT: imull %r9d, %r8d +; WIN64-NEXT: leal (%r12,%r14), %r9d +; WIN64-NEXT: movl %r12d, %r11d +; WIN64-NEXT: subl %r14d, %r11d +; WIN64-NEXT: imull %ebp, %r11d +; WIN64-NEXT: movl {{[0-9]+}}(%rsp), %r14d +; WIN64-NEXT: addl %r8d, %r11d +; WIN64-NEXT: movl %r15d, %r8d +; WIN64-NEXT: subl %r14d, %r8d +; WIN64-NEXT: imull %esi, %r8d +; WIN64-NEXT: addl %r11d, %r8d ; WIN64-NEXT: addl %ecx, %eax -; WIN64-NEXT: imull %r8d, %eax -; WIN64-NEXT: imull %ebx, %r11d -; WIN64-NEXT: addl %r11d, %eax -; WIN64-NEXT: addl %r15d, %edx -; WIN64-NEXT: imull %edi, %edx -; WIN64-NEXT: addl %edx, %eax +; WIN64-NEXT: imull %edi, %eax +; WIN64-NEXT: imull %ebx, %r9d ; WIN64-NEXT: addl %r9d, %eax +; WIN64-NEXT: addl %r15d, %r14d +; WIN64-NEXT: imull %edx, %r14d +; WIN64-NEXT: addl %r14d, %eax +; WIN64-NEXT: addl %r8d, %eax ; WIN64-NEXT: popq %rbx +; WIN64-NEXT: popq %rbp ; WIN64-NEXT: retq ; ; LINUXOSX-LABEL: testi32_inp: @@ -296,35 +297,35 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; LINUXOSX-NEXT: # kill: def $r8d killed $r8d def $r8 ; LINUXOSX-NEXT: # kill: def $edi killed $edi def $rdi ; LINUXOSX-NEXT: leal (%rdx,%rdi), %r10d -; LINUXOSX-NEXT: # kill: def $edx killed $edx killed $rdx -; LINUXOSX-NEXT: subl %edi, %edx -; LINUXOSX-NEXT: leal (%rsi,%r8), %edi +; LINUXOSX-NEXT: movl %edx, %r11d +; LINUXOSX-NEXT: subl %edi, %r11d +; LINUXOSX-NEXT: leal (%rsi,%r8), %edx ; LINUXOSX-NEXT: # kill: def $esi killed $esi killed $rsi ; LINUXOSX-NEXT: subl %r8d, %esi -; LINUXOSX-NEXT: leal (%r9,%r12), %r8d -; LINUXOSX-NEXT: # kill: def $r9d killed $r9d killed $r9 -; LINUXOSX-NEXT: subl %r12d, %r9d -; LINUXOSX-NEXT: movl %eax, %r11d -; LINUXOSX-NEXT: subl %ecx, %r11d -; LINUXOSX-NEXT: imull %r11d, %r9d -; LINUXOSX-NEXT: leal (%r13,%r14), %r11d +; LINUXOSX-NEXT: leal (%r9,%r12), %edi +; LINUXOSX-NEXT: movl %r9d, %r8d +; LINUXOSX-NEXT: subl %r12d, %r8d +; LINUXOSX-NEXT: movl %eax, %r9d +; LINUXOSX-NEXT: subl %ecx, %r9d +; LINUXOSX-NEXT: imull %r9d, %r8d +; LINUXOSX-NEXT: leal (%r13,%r14), %r9d ; LINUXOSX-NEXT: movl %r13d, %r12d ; LINUXOSX-NEXT: subl %r14d, %r12d -; LINUXOSX-NEXT: imull %edx, %r12d -; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %edx -; LINUXOSX-NEXT: addl %r9d, %r12d -; LINUXOSX-NEXT: movl %r15d, %r9d -; LINUXOSX-NEXT: subl %edx, %r9d -; LINUXOSX-NEXT: imull %esi, %r9d -; LINUXOSX-NEXT: addl %r12d, %r9d +; LINUXOSX-NEXT: imull %r11d, %r12d +; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; LINUXOSX-NEXT: addl %r8d, %r12d +; LINUXOSX-NEXT: movl %r15d, %r8d +; LINUXOSX-NEXT: subl %r11d, %r8d +; LINUXOSX-NEXT: imull %esi, %r8d +; LINUXOSX-NEXT: addl %r12d, %r8d ; LINUXOSX-NEXT: addl %ecx, %eax -; LINUXOSX-NEXT: imull %r8d, %eax -; LINUXOSX-NEXT: imull %r10d, %r11d -; LINUXOSX-NEXT: addl %r11d, %eax -; LINUXOSX-NEXT: addl %r15d, %edx -; LINUXOSX-NEXT: imull %edi, %edx -; LINUXOSX-NEXT: addl %edx, %eax +; LINUXOSX-NEXT: imull %edi, %eax +; LINUXOSX-NEXT: imull %r10d, %r9d ; LINUXOSX-NEXT: addl %r9d, %eax +; LINUXOSX-NEXT: addl %r15d, %r11d +; LINUXOSX-NEXT: imull %edx, %r11d +; LINUXOSX-NEXT: addl %r11d, %eax +; LINUXOSX-NEXT: addl %r8d, %eax ; LINUXOSX-NEXT: retq i32 %b1, i32 %b2, i32 %b3, i32 %b4, i32 %b5, i32 %b6) nounwind { %x1 = sub i32 %a1, %a2 diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll index c9c62343fb61e..f91758b861b4c 100644 --- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -71,21 +71,20 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; X86-NEXT: subl $20, %esp ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edx, %eax ; X86-NEXT: shll %cl, %eax -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: xorl %edi, %edi +; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovnel %eax, %esi -; X86-NEXT: cmovnel %edi, %eax +; X86-NEXT: cmovnel %eax, %edi +; X86-NEXT: cmovnel %edx, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %esi, %ebx +; X86-NEXT: movl %edi, %ebx ; X86-NEXT: sarl %cl, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %esi, %edx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sarl $31, %eax ; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovel %ebx, %eax @@ -100,46 +99,46 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; X86-NEXT: shldl %cl, %esi, %ebx ; X86-NEXT: testb $32, %ch ; X86-NEXT: cmovnel %eax, %ebx -; X86-NEXT: cmovnel %edi, %eax +; X86-NEXT: cmovnel %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: sarl %cl, %edi ; X86-NEXT: movl %ebx, %esi -; X86-NEXT: sarl $31, %esi +; X86-NEXT: sarl %cl, %esi +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: testb $32, %ch -; X86-NEXT: cmovel %edi, %esi +; X86-NEXT: cmovel %esi, %edx ; X86-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-NEXT: shrdl %cl, %edx, %eax +; X86-NEXT: shrdl %cl, %edi, %eax ; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrdl %cl, %ebx, %edx +; X86-NEXT: shrdl %cl, %ebx, %edi ; X86-NEXT: testb $32, %ch -; X86-NEXT: cmovnel %edi, %edx +; X86-NEXT: cmovnel %esi, %edi ; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: xorl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: xorl $2147483647, %ecx # imm = 0x7FFFFFFF ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: notl %edi -; X86-NEXT: cmovel (%esp), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-NEXT: notl %esi +; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ebp, %esi +; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %ebp, %edx ; X86-NEXT: sarl $31, %ebp -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: xorl $2147483647, %edi # imm = 0x7FFFFFFF -; X86-NEXT: orl %esi, %edx +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: xorl $2147483647, %esi # imm = 0x7FFFFFFF +; X86-NEXT: orl %edx, %edi ; X86-NEXT: notl %ebp ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: cmovel %ebx, %edi +; X86-NEXT: cmovel %ebx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %esi, 12(%eax) ; X86-NEXT: movl %ebp, 8(%eax) ; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: movl (%esp), %ecx # 4-byte Reload @@ -157,41 +156,41 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-LABEL: vec_v4i32: ; X64: # %bb.0: -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; X64-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X64-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; X64-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,1,1,1,4,5,6,7] ; X64-NEXT: pslld $23, %xmm1 ; X64-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-NEXT: cvttps2dq %xmm1, %xmm5 +; X64-NEXT: cvttps2dq %xmm1, %xmm6 ; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: pmuludq %xmm5, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; X64-NEXT: pmuludq %xmm6, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] ; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X64-NEXT: pmuludq %xmm7, %xmm5 -; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; X64-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] -; X64-NEXT: movdqa %xmm6, %xmm7 -; X64-NEXT: psrad %xmm5, %xmm7 -; X64-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] -; X64-NEXT: movdqa %xmm1, %xmm5 -; X64-NEXT: psrad %xmm2, %xmm5 -; X64-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm7[1] -; X64-NEXT: movdqa %xmm6, %xmm2 -; X64-NEXT: psrad %xmm3, %xmm2 -; X64-NEXT: psrad %xmm4, %xmm1 -; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[0,3] +; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X64-NEXT: pmuludq %xmm7, %xmm6 +; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; X64-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,3,3,3,4,5,6,7] +; X64-NEXT: movdqa %xmm2, %xmm7 +; X64-NEXT: psrad %xmm6, %xmm7 +; X64-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] +; X64-NEXT: movdqa %xmm1, %xmm6 +; X64-NEXT: psrad %xmm3, %xmm6 +; X64-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; X64-NEXT: movdqa %xmm2, %xmm3 +; X64-NEXT: psrad %xmm4, %xmm3 +; X64-NEXT: psrad %xmm5, %xmm1 +; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3] ; X64-NEXT: pcmpeqd %xmm0, %xmm1 -; X64-NEXT: pand %xmm1, %xmm6 -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: pcmpgtd %xmm0, %xmm2 -; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; X64-NEXT: pand %xmm1, %xmm2 +; X64-NEXT: pxor %xmm3, %xmm3 +; X64-NEXT: pcmpgtd %xmm0, %xmm3 +; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-NEXT: por %xmm2, %xmm0 +; X64-NEXT: por %xmm3, %xmm0 ; X64-NEXT: pandn %xmm0, %xmm1 -; X64-NEXT: por %xmm6, %xmm1 +; X64-NEXT: por %xmm2, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq ; @@ -215,18 +214,18 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %edx +; X86-NEXT: shll %cl, %edx ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: shll %cl, %ebp -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: sarl %cl, %edi +; X86-NEXT: sarl %cl, %ebp ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testl %edx, %edx +; X86-NEXT: testl %edi, %edi ; X86-NEXT: sets %bl ; X86-NEXT: addl $2147483647, %ebx # imm = 0x7FFFFFFF -; X86-NEXT: cmpl %edi, %edx +; X86-NEXT: cmpl %ebp, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovel %ebp, %ebx +; X86-NEXT: cmovel %edx, %ebx ; X86-NEXT: movl %edi, %ebp ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shll %cl, %ebp diff --git a/llvm/test/CodeGen/X86/statepoint-live-in.ll b/llvm/test/CodeGen/X86/statepoint-live-in.ll index d43c0b93136a5..787a33aa49b20 100644 --- a/llvm/test/CodeGen/X86/statepoint-live-in.ll +++ b/llvm/test/CodeGen/X86/statepoint-live-in.ll @@ -449,7 +449,7 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: movl %edx, %r14d ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl %r8d, %r12d +; CHECK-NEXT: movl %r8d, %r15d ; CHECK-NEXT: movl %r9d, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r13d @@ -464,10 +464,10 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r12d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: callq _bar ## 160-byte Folded Reload @@ -476,25 +476,25 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload ; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload ; CHECK-NEXT: addq %rax, %r14 -; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Folded Reload -; CHECK-NEXT: addq %r14, %r12 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %r12 +; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 ## 8-byte Folded Reload +; CHECK-NEXT: addq %r14, %r15 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %r15 -; CHECK-NEXT: addq %r12, %r15 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbx ; CHECK-NEXT: addq %r15, %rbx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: addq %rax, %r12 +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: addq %rax, %r12 +; CHECK-NEXT: addq %rbx, %r12 +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbp -; CHECK-NEXT: addq %rbx, %rbp +; CHECK-NEXT: addq %r12, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %r13 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax diff --git a/llvm/test/CodeGen/X86/statepoint-regs.ll b/llvm/test/CodeGen/X86/statepoint-regs.ll index 60a3d94ab23b1..5c26e29dce45e 100644 --- a/llvm/test/CodeGen/X86/statepoint-regs.ll +++ b/llvm/test/CodeGen/X86/statepoint-regs.ll @@ -561,7 +561,7 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: movl %edx, %r14d ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl %r8d, %r12d +; CHECK-NEXT: movl %r8d, %r15d ; CHECK-NEXT: movl %r9d, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r13d @@ -576,10 +576,10 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r12d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: callq _bar ## 160-byte Folded Reload @@ -588,25 +588,25 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload ; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload ; CHECK-NEXT: addq %rax, %r14 -; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Folded Reload -; CHECK-NEXT: addq %r14, %r12 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %r12 +; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 ## 8-byte Folded Reload +; CHECK-NEXT: addq %r14, %r15 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %r15 -; CHECK-NEXT: addq %r12, %r15 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbx ; CHECK-NEXT: addq %r15, %rbx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: addq %rax, %r12 +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: addq %rax, %r12 +; CHECK-NEXT: addq %rbx, %r12 +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbp -; CHECK-NEXT: addq %rbx, %rbp +; CHECK-NEXT: addq %r12, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %r13 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax diff --git a/llvm/test/CodeGen/X86/sttni.ll b/llvm/test/CodeGen/X86/sttni.ll index 7d1a6171c844a..870912bb6bb1b 100644 --- a/llvm/test/CodeGen/X86/sttni.ll +++ b/llvm/test/CodeGen/X86/sttni.ll @@ -1110,15 +1110,15 @@ entry: define void @pcmpistr_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, ptr %iptr, ptr %fptr) nounwind { ; X86-LABEL: pcmpistr_index_flag: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0 -; X86-NEXT: setb %bl -; X86-NEXT: movl %ecx, (%edx) -; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: popl %ebx +; X86-NEXT: setb %al +; X86-NEXT: movl %ecx, (%esi) +; X86-NEXT: movl %eax, (%edx) +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: pcmpistr_index_flag: @@ -1140,13 +1140,13 @@ entry: define void @pcmpistr_mask_flag(<16 x i8> %lhs, <16 x i8> %rhs, ptr %mptr, ptr %fptr) nounwind { ; X86-LABEL: pcmpistr_mask_flag: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: pcmpistrm $24, %xmm1, %xmm0 -; X86-NEXT: setb %dl -; X86-NEXT: movdqa %xmm0, (%ecx) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: setb %al +; X86-NEXT: movdqa %xmm0, (%edx) +; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: pcmpistr_mask_flag: diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll index 1f9153d662019..e0f438eb7cc8f 100644 --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -5163,39 +5163,39 @@ define void @vec384_v24i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: notb %dil ; SCALAR-NEXT: movzbl %dil, %edi ; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: shll $8, %r9d -; SCALAR-NEXT: orl %edi, %r9d -; SCALAR-NEXT: movq %rcx, %r11 -; SCALAR-NEXT: shrq $40, %r11 +; SCALAR-NEXT: movzbl %r9b, %r11d +; SCALAR-NEXT: shll $8, %r11d +; SCALAR-NEXT: orl %edi, %r11d +; SCALAR-NEXT: movq %rcx, %r9 +; SCALAR-NEXT: shrq $40, %r9 ; SCALAR-NEXT: shll $16, %r10d -; SCALAR-NEXT: movzwl %r9w, %edi +; SCALAR-NEXT: movzwl %r11w, %edi ; SCALAR-NEXT: orl %r10d, %edi -; SCALAR-NEXT: movq %rcx, %r9 -; SCALAR-NEXT: shrq $56, %r9 +; SCALAR-NEXT: movq %rcx, %r10 +; SCALAR-NEXT: shrq $56, %r10 ; SCALAR-NEXT: shlq $32, %r8 ; SCALAR-NEXT: orq %r8, %rdi ; SCALAR-NEXT: movq %rcx, %r8 ; SCALAR-NEXT: shrq $48, %r8 ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movzbl %r8b, %r8d -; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: shll $8, %r9d -; SCALAR-NEXT: orl %r8d, %r9d +; SCALAR-NEXT: notb %r10b +; SCALAR-NEXT: movzbl %r10b, %r10d +; SCALAR-NEXT: shll $8, %r10d +; SCALAR-NEXT: orl %r8d, %r10d ; SCALAR-NEXT: movq %rcx, %r8 ; SCALAR-NEXT: shrq $32, %r8 ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movzbl %r8b, %r8d -; SCALAR-NEXT: notb %r11b -; SCALAR-NEXT: movzbl %r11b, %r10d -; SCALAR-NEXT: shll $8, %r10d -; SCALAR-NEXT: orl %r8d, %r10d +; SCALAR-NEXT: notb %r9b +; SCALAR-NEXT: movzbl %r9b, %r9d +; SCALAR-NEXT: shll $8, %r9d +; SCALAR-NEXT: orl %r8d, %r9d ; SCALAR-NEXT: movl %ecx, %r11d ; SCALAR-NEXT: shrl $24, %r11d -; SCALAR-NEXT: shll $16, %r9d -; SCALAR-NEXT: movzwl %r10w, %r8d -; SCALAR-NEXT: orl %r9d, %r8d +; SCALAR-NEXT: shll $16, %r10d +; SCALAR-NEXT: movzwl %r9w, %r8d +; SCALAR-NEXT: orl %r10d, %r8d ; SCALAR-NEXT: movl %ecx, %r9d ; SCALAR-NEXT: shrl $16, %r9d ; SCALAR-NEXT: notb %r9b @@ -5209,39 +5209,39 @@ define void @vec384_v24i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: notb %cl ; SCALAR-NEXT: movzbl %cl, %ecx ; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: shll $8, %r9d -; SCALAR-NEXT: orl %ecx, %r9d -; SCALAR-NEXT: movq %rax, %r11 -; SCALAR-NEXT: shrq $40, %r11 +; SCALAR-NEXT: movzbl %r9b, %r11d +; SCALAR-NEXT: shll $8, %r11d +; SCALAR-NEXT: orl %ecx, %r11d +; SCALAR-NEXT: movq %rax, %r9 +; SCALAR-NEXT: shrq $40, %r9 ; SCALAR-NEXT: shll $16, %r10d -; SCALAR-NEXT: movzwl %r9w, %ecx +; SCALAR-NEXT: movzwl %r11w, %ecx ; SCALAR-NEXT: orl %r10d, %ecx -; SCALAR-NEXT: movq %rax, %r9 -; SCALAR-NEXT: shrq $56, %r9 +; SCALAR-NEXT: movq %rax, %r10 +; SCALAR-NEXT: shrq $56, %r10 ; SCALAR-NEXT: shlq $32, %r8 ; SCALAR-NEXT: orq %r8, %rcx ; SCALAR-NEXT: movq %rax, %r8 ; SCALAR-NEXT: shrq $48, %r8 ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movzbl %r8b, %r8d -; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: shll $8, %r9d -; SCALAR-NEXT: orl %r8d, %r9d +; SCALAR-NEXT: notb %r10b +; SCALAR-NEXT: movzbl %r10b, %r10d +; SCALAR-NEXT: shll $8, %r10d +; SCALAR-NEXT: orl %r8d, %r10d ; SCALAR-NEXT: movq %rax, %r8 ; SCALAR-NEXT: shrq $32, %r8 ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movzbl %r8b, %r8d -; SCALAR-NEXT: notb %r11b -; SCALAR-NEXT: movzbl %r11b, %r10d -; SCALAR-NEXT: shll $8, %r10d -; SCALAR-NEXT: orl %r8d, %r10d +; SCALAR-NEXT: notb %r9b +; SCALAR-NEXT: movzbl %r9b, %r9d +; SCALAR-NEXT: shll $8, %r9d +; SCALAR-NEXT: orl %r8d, %r9d ; SCALAR-NEXT: movl %eax, %r11d ; SCALAR-NEXT: shrl $24, %r11d -; SCALAR-NEXT: shll $16, %r9d -; SCALAR-NEXT: movzwl %r10w, %r8d -; SCALAR-NEXT: orl %r9d, %r8d +; SCALAR-NEXT: shll $16, %r10d +; SCALAR-NEXT: movzwl %r9w, %r8d +; SCALAR-NEXT: orl %r10d, %r8d ; SCALAR-NEXT: movl %eax, %r9d ; SCALAR-NEXT: shrl $16, %r9d ; SCALAR-NEXT: notb %r9b @@ -7455,9 +7455,9 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r12b ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: notb %bl -; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SCALAR-NEXT: notb %r11b +; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r13b ; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill @@ -7474,36 +7474,36 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movzbl 19(%rdi), %eax ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 20(%rdi), %r11d -; SCALAR-NEXT: notb %r11b -; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 21(%rdi), %eax +; SCALAR-NEXT: movzbl 20(%rdi), %eax ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 22(%rdi), %ebp +; SCALAR-NEXT: movzbl 21(%rdi), %ebp ; SCALAR-NEXT: notb %bpl ; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 22(%rdi), %ebx +; SCALAR-NEXT: notb %bl +; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 23(%rdi), %r10d ; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 24(%rdi), %r9d ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 25(%rdi), %r14d +; SCALAR-NEXT: movzbl 25(%rdi), %ecx +; SCALAR-NEXT: notb %cl +; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 26(%rdi), %r14d ; SCALAR-NEXT: notb %r14b ; SCALAR-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 26(%rdi), %r15d +; SCALAR-NEXT: movzbl 27(%rdi), %r15d ; SCALAR-NEXT: notb %r15b ; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 27(%rdi), %r12d +; SCALAR-NEXT: movzbl 28(%rdi), %r12d ; SCALAR-NEXT: notb %r12b ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 28(%rdi), %r13d +; SCALAR-NEXT: movzbl 29(%rdi), %r13d ; SCALAR-NEXT: notb %r13b ; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 29(%rdi), %ecx -; SCALAR-NEXT: notb %cl -; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 30(%rdi), %eax ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill @@ -7512,46 +7512,47 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movb %dil, 31(%rsi) ; SCALAR-NEXT: movb %al, 30(%rsi) -; SCALAR-NEXT: movb %cl, 29(%rsi) -; SCALAR-NEXT: movb %r13b, 28(%rsi) -; SCALAR-NEXT: movb %r12b, 27(%rsi) -; SCALAR-NEXT: movb %r15b, 26(%rsi) -; SCALAR-NEXT: movb %r14b, 25(%rsi) +; SCALAR-NEXT: movb %r13b, 29(%rsi) +; SCALAR-NEXT: movb %r12b, 28(%rsi) +; SCALAR-NEXT: movb %r15b, 27(%rsi) +; SCALAR-NEXT: movb %r14b, 26(%rsi) +; SCALAR-NEXT: movb %cl, 25(%rsi) ; SCALAR-NEXT: movb %r9b, 24(%rsi) ; SCALAR-NEXT: movb %r10b, 23(%rsi) -; SCALAR-NEXT: movb %bpl, 22(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SCALAR-NEXT: movb %bl, 22(%rsi) ; SCALAR-NEXT: movb %bpl, 21(%rsi) -; SCALAR-NEXT: movb %r11b, 20(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SCALAR-NEXT: movb %bpl, 20(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; SCALAR-NEXT: movb %al, 19(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; SCALAR-NEXT: movb %al, 18(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; SCALAR-NEXT: movb %al, 17(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r11b, 16(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 16(%rsi) ; SCALAR-NEXT: movb %r8b, 15(%rsi) ; SCALAR-NEXT: movl %r8d, %r14d -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r10b, 14(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: movb %dil, 13(%rsi) +; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SCALAR-NEXT: movb %bl, 14(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 13(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; SCALAR-NEXT: movb %al, 12(%rsi) -; SCALAR-NEXT: movb %bl, 11(%rsi) +; SCALAR-NEXT: movb %r11b, 11(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: movb %dil, 10(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: movb %dil, 9(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: movb %dil, 8(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r11b, 7(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r13b, 10(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 9(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 8(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 7(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r8b, 6(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r8b, 5(%rsi) +; SCALAR-NEXT: movb %r13b, 6(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 5(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload ; SCALAR-NEXT: movb %r12b, 4(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload @@ -7560,8 +7561,8 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movb %r15b, 2(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload ; SCALAR-NEXT: movb %r8b, 1(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, (%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: movb %dil, (%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 31(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload @@ -7582,84 +7583,83 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movb %sil, 23(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 22(%rdx) -; SCALAR-NEXT: movb %bpl, 21(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SCALAR-NEXT: movb %sil, 20(%rdx) +; SCALAR-NEXT: movb %sil, 21(%rdx) +; SCALAR-NEXT: movb %bpl, 20(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 19(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 18(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SCALAR-NEXT: movb %bpl, 17(%rdx) -; SCALAR-NEXT: movb %r11b, 16(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SCALAR-NEXT: movb %sil, 17(%rdx) +; SCALAR-NEXT: movb %cl, 16(%rdx) ; SCALAR-NEXT: movb %r14b, 15(%rdx) -; SCALAR-NEXT: movb %r10b, 14(%rdx) -; SCALAR-NEXT: movb %dil, 13(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r11b, 12(%rdx) +; SCALAR-NEXT: movb %bl, 14(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 13(%rdx) +; SCALAR-NEXT: movb %al, 12(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 11(%rdx) -; SCALAR-NEXT: movb %r13b, 10(%rdx) -; SCALAR-NEXT: movb %cl, 9(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r10b, 8(%rdx) -; SCALAR-NEXT: movb %al, 7(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 6(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 5(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SCALAR-NEXT: movb %bl, 10(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r14b, 9(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SCALAR-NEXT: movb %bpl, 8(%rdx) +; SCALAR-NEXT: movb %r11b, 7(%rdx) +; SCALAR-NEXT: movb %r13b, 6(%rdx) +; SCALAR-NEXT: movb %r10b, 5(%rdx) ; SCALAR-NEXT: movb %r12b, 4(%rdx) ; SCALAR-NEXT: movb %r9b, 3(%rdx) ; SCALAR-NEXT: movb %r15b, 2(%rdx) ; SCALAR-NEXT: movb %r8b, 1(%rdx) -; SCALAR-NEXT: movb %bl, (%rdx) -; SCALAR-NEXT: movl %ebx, %edi -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 63(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 62(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 61(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 60(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 59(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 58(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 57(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 56(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 55(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 54(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 53(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 52(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 51(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 50(%rdx) -; SCALAR-NEXT: movb %bpl, 49(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 48(%rdx) -; SCALAR-NEXT: movb %r14b, 47(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 46(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 45(%rdx) -; SCALAR-NEXT: movb %r11b, 44(%rdx) +; SCALAR-NEXT: movb %dil, (%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 63(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 62(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 61(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 60(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 59(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 58(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 57(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 56(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 55(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 54(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 53(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 52(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 51(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 50(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 49(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 48(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 47(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 46(%rdx) +; SCALAR-NEXT: movb %cl, 45(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 44(%rdx) ; SCALAR-NEXT: movb %sil, 43(%rdx) -; SCALAR-NEXT: movb %r13b, 42(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SCALAR-NEXT: movb %sil, 41(%rdx) -; SCALAR-NEXT: movb %r10b, 40(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SCALAR-NEXT: movb %sil, 39(%rdx) -; SCALAR-NEXT: movb %cl, 38(%rdx) -; SCALAR-NEXT: movb %al, 37(%rdx) +; SCALAR-NEXT: movb %bl, 42(%rdx) +; SCALAR-NEXT: movb %r14b, 41(%rdx) +; SCALAR-NEXT: movb %bpl, 40(%rdx) +; SCALAR-NEXT: movb %r11b, 39(%rdx) +; SCALAR-NEXT: movb %r13b, 38(%rdx) +; SCALAR-NEXT: movb %r10b, 37(%rdx) ; SCALAR-NEXT: movb %r12b, 36(%rdx) ; SCALAR-NEXT: movb %r9b, 35(%rdx) ; SCALAR-NEXT: movb %r15b, 34(%rdx) diff --git a/llvm/test/CodeGen/X86/swifterror.ll b/llvm/test/CodeGen/X86/swifterror.ll index 41d2b1e1939cc..6d77e04504e2d 100644 --- a/llvm/test/CodeGen/X86/swifterror.ll +++ b/llvm/test/CodeGen/X86/swifterror.ll @@ -1708,11 +1708,11 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 ; CHECK-i386-NEXT: .cfi_offset %edi, -16 ; CHECK-i386-NEXT: .cfi_offset %ebx, -12 ; CHECK-i386-NEXT: .cfi_offset %ebp, -8 +; CHECK-i386-NEXT: movl 148(%esp), %esi ; CHECK-i386-NEXT: movl $0, 64(%esp) -; CHECK-i386-NEXT: movl 188(%esp), %ebp ; CHECK-i386-NEXT: movl 192(%esp), %ebx -; CHECK-i386-NEXT: movl 196(%esp), %edi -; CHECK-i386-NEXT: movl 200(%esp), %esi +; CHECK-i386-NEXT: movl 196(%esp), %ebp +; CHECK-i386-NEXT: movl 200(%esp), %edi ; CHECK-i386-NEXT: leal 64(%esp), %eax ; CHECK-i386-NEXT: movl %eax, 52(%esp) ; CHECK-i386-NEXT: movl $0, 48(%esp) @@ -1729,10 +1729,11 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 ; CHECK-i386-NEXT: movl $0, 4(%esp) ; CHECK-i386-NEXT: movl $1, (%esp) ; CHECK-i386-NEXT: calll _params_in_reg2 -; CHECK-i386-NEXT: movl %esi, 56(%esp) -; CHECK-i386-NEXT: movl %edi, 52(%esp) +; CHECK-i386-NEXT: movl %edi, 56(%esp) +; CHECK-i386-NEXT: movl %ebp, 52(%esp) ; CHECK-i386-NEXT: movl %ebx, 48(%esp) -; CHECK-i386-NEXT: movl %ebp, 44(%esp) +; CHECK-i386-NEXT: movl 188(%esp), %eax +; CHECK-i386-NEXT: movl %eax, 44(%esp) ; CHECK-i386-NEXT: movl 184(%esp), %eax ; CHECK-i386-NEXT: movl %eax, 40(%esp) ; CHECK-i386-NEXT: movl 180(%esp), %eax @@ -1751,8 +1752,7 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 ; CHECK-i386-NEXT: movl %eax, 12(%esp) ; CHECK-i386-NEXT: movl 152(%esp), %eax ; CHECK-i386-NEXT: movl %eax, 8(%esp) -; CHECK-i386-NEXT: movl 148(%esp), %eax -; CHECK-i386-NEXT: movl %eax, 4(%esp) +; CHECK-i386-NEXT: movl %esi, 4(%esp) ; CHECK-i386-NEXT: leal 88(%esp), %eax ; CHECK-i386-NEXT: movl %eax, (%esp) ; CHECK-i386-NEXT: calll _params_and_return_in_reg2 @@ -1767,8 +1767,8 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 ; CHECK-i386-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-i386-NEXT: movl 104(%esp), %ebp ; CHECK-i386-NEXT: movl 108(%esp), %edi -; CHECK-i386-NEXT: movl 112(%esp), %ebx -; CHECK-i386-NEXT: movl 116(%esp), %esi +; CHECK-i386-NEXT: movl 112(%esp), %esi +; CHECK-i386-NEXT: movl 116(%esp), %ebx ; CHECK-i386-NEXT: leal 64(%esp), %eax ; CHECK-i386-NEXT: movl %eax, 52(%esp) ; CHECK-i386-NEXT: movl $0, 48(%esp) @@ -1786,8 +1786,8 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 ; CHECK-i386-NEXT: movl $1, (%esp) ; CHECK-i386-NEXT: calll _params_in_reg2 ; CHECK-i386-NEXT: movl 144(%esp), %eax -; CHECK-i386-NEXT: movl %esi, 28(%eax) -; CHECK-i386-NEXT: movl %ebx, 24(%eax) +; CHECK-i386-NEXT: movl %ebx, 28(%eax) +; CHECK-i386-NEXT: movl %esi, 24(%eax) ; CHECK-i386-NEXT: movl %edi, 20(%eax) ; CHECK-i386-NEXT: movl %ebp, 16(%eax) ; CHECK-i386-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll index e6b39eb8b9522..f0479aea1b82c 100644 --- a/llvm/test/CodeGen/X86/umax.ll +++ b/llvm/test/CodeGen/X86/umax.ll @@ -286,13 +286,13 @@ define i128 @test_i128_1(i128 %a) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmpl $1, %eax ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: testl %esi, %esi +; X86-NEXT: testl %edx, %edx ; X86-NEXT: movl $1, %edi ; X86-NEXT: cmovnel %eax, %edi ; X86-NEXT: cmovel %ebx, %edi @@ -300,17 +300,17 @@ define i128 @test_i128_1(i128 %a) nounwind { ; X86-NEXT: movl %ecx, %ebp ; X86-NEXT: negl %ebp ; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %edx, %ebp +; X86-NEXT: sbbl %esi, %ebp ; X86-NEXT: movl $1, %ebp ; X86-NEXT: cmovbl %eax, %ebp ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: cmovbl %esi, %ebx -; X86-NEXT: orl %edx, %eax +; X86-NEXT: cmovbl %edx, %ebx +; X86-NEXT: orl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %esi, 12(%eax) ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: cmovel %edi, %ebp -; X86-NEXT: cmovel %esi, %ebx +; X86-NEXT: cmovel %edx, %ebx ; X86-NEXT: movl %ebx, 4(%eax) ; X86-NEXT: movl %ebp, (%eax) ; X86-NEXT: popl %esi @@ -367,29 +367,29 @@ define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl %edi, %edx -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: sbbl %ebx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmovbl %edi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %ebx, %ecx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %ebp, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovbl %ebx, %esi -; X86-NEXT: cmpl %ecx, %ebp -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: cmovbl %ecx, %ebp -; X86-NEXT: cmovbl %eax, %edi +; X86-NEXT: cmovbl %ebx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmovbl %ebp, %esi +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: cmovbl %edx, %edi +; X86-NEXT: cmovbl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ebp, 8(%eax) +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -715,26 +715,26 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-NEXT: cmpl %ecx, %eax ; X86-NEXT: cmoval %eax, %ecx ; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmoval %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl %eax, %edx -; X86-NEXT: cmoval %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl %ecx, 24(%edx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, 28(%ecx) +; X86-NEXT: movl %edx, 24(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl %eax, 20(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl %esi, 12(%edx) -; X86-NEXT: movl %edi, 8(%edx) -; X86-NEXT: movl %ebx, 4(%edx) -; X86-NEXT: movl %ebp, (%edx) -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl %eax, 16(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl %ebx, 4(%ecx) +; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -1316,17 +1316,17 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: shrdl $28, %edi, %ecx ; X86-NEXT: sarl $28, %edi -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: cmovbl %edx, %ecx -; X86-NEXT: cmovbl %esi, %edi +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: cmovbl %esi, %ecx +; X86-NEXT: cmovbl %edx, %edi ; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: sarl $31, %edi ; X86-NEXT: movl %ecx, (%eax) diff --git a/llvm/test/CodeGen/X86/umin.ll b/llvm/test/CodeGen/X86/umin.ll index e1538aaaeba65..e4ce08966a894 100644 --- a/llvm/test/CodeGen/X86/umin.ll +++ b/llvm/test/CodeGen/X86/umin.ll @@ -397,26 +397,26 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-NEXT: cmpl %ecx, %eax ; X86-NEXT: cmovbl %eax, %ecx ; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovbl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl %eax, %edx -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl %ecx, 24(%edx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, 28(%ecx) +; X86-NEXT: movl %edx, 24(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl %eax, 20(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl %esi, 12(%edx) -; X86-NEXT: movl %edi, 8(%edx) -; X86-NEXT: movl %ebx, 4(%edx) -; X86-NEXT: movl %ebp, (%edx) -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl %eax, 16(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl %ebx, 4(%ecx) +; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -731,17 +731,17 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: shrdl $28, %edi, %ecx ; X86-NEXT: sarl $28, %edi -; X86-NEXT: cmpl %ecx, %edx -; X86-NEXT: movl %esi, %ebx +; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: sbbl %edi, %ebx -; X86-NEXT: cmovbl %edx, %ecx -; X86-NEXT: cmovbl %esi, %edi +; X86-NEXT: cmovbl %esi, %ecx +; X86-NEXT: cmovbl %edx, %edi ; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: sarl $31, %edi ; X86-NEXT: movl %ecx, (%eax) diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll index f5248d8679717..ccabb360a990c 100644 --- a/llvm/test/CodeGen/X86/umul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll @@ -93,426 +93,417 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %esi, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: adcl %ecx, %esi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %ebp ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebp, %ebx +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi +; X86-NEXT: addl %edi, %esi ; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: adcl %ebx, %ebp +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %ebx, %ebp -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: setb (%esp) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edi ; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl (%esp), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: setb (%esp) # 1-byte Folded Spill +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl (%esp), %esi # 1-byte Folded Reload -; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: adcl %edi, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %esi, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: mull %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movzbl %bl, %ecx -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl %ebp, %ebx -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %edi ; X86-NEXT: addl %ecx, %edi ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ecx, %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: adcl %ebx, %ebp +; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %esi -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: addl (%esp), %edi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: imull %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %edx ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: imull {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl %ecx, %edi +; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx +; X86-NEXT: addl %ecx, %ebx ; X86-NEXT: movl %eax, %edx -; X86-NEXT: addl %esi, %edx -; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: addl %edi, %edx +; X86-NEXT: adcl %esi, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edi, %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebp, %edi +; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: setb %cl +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: setb (%esp) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edi -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl %ebx, (%esp) # 4-byte Folded Spill +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: imull %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: imull {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl %ecx, %esi ; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: imull {{[0-9]+}}(%esp), %edi +; X86-NEXT: addl %ebp, %edi +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: imull %edx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: imull %edx, %esi ; X86-NEXT: mull %edx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: imull {{[0-9]+}}(%esp), %esi -; X86-NEXT: addl %ebp, %esi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: imull {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edx, %edi +; X86-NEXT: addl %esi, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: imull %edx, %ecx +; X86-NEXT: imull %edx, %esi ; X86-NEXT: mull %edx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edx, %esi -; X86-NEXT: addl %ecx, %esi +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 4(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, (%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 16(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 20(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 24(%ecx) +; X86-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 28(%ecx) +; X86-NEXT: movl %eax, 32(%ecx) +; X86-NEXT: andl $4095, %ebx # imm = 0xFFF +; X86-NEXT: movw %bx, 36(%ecx) ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: imull %edx, %ebp -; X86-NEXT: mull %edx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 4(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, (%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 8(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 12(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 16(%edx) -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 20(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 24(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 28(%edx) -; X86-NEXT: movl %eax, 32(%edx) -; X86-NEXT: andl $4095, %ecx # imm = 0xFFF -; X86-NEXT: movw %cx, 36(%edx) -; X86-NEXT: movl %edx, %eax ; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -532,57 +523,57 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X64-NEXT: movq %r8, %r11 ; X64-NEXT: movq %rcx, %r8 ; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %r10, %rbp +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %r15 ; X64-NEXT: addq %rbx, %r15 ; X64-NEXT: adcq $0, %r14 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %r15, %rbx -; X64-NEXT: adcq %r14, %r12 +; X64-NEXT: adcq %r14, %rbp ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %r10d ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r12, %r13 -; X64-NEXT: adcq %r10, %r15 +; X64-NEXT: addq %rbp, %r13 +; X64-NEXT: adcq %r10, %r12 ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rbp +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %r12, %r10 +; X64-NEXT: addq %r15, %r10 ; X64-NEXT: adcq $0, %rbp ; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %r10, %r12 +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r10, %r15 ; X64-NEXT: adcq %rbp, %rdx ; X64-NEXT: imulq %r9, %r11 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; X64-NEXT: addq %r13, %r14 -; X64-NEXT: adcq %r15, %r12 +; X64-NEXT: adcq %r12, %r15 ; X64-NEXT: adcq %rdx, %r11 ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r15 +; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r13 @@ -596,8 +587,8 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X64-NEXT: adcq %r13, %rdx ; X64-NEXT: imulq %r10, %rcx ; X64-NEXT: addq %rdx, %rcx -; X64-NEXT: addq %r14, %r15 -; X64-NEXT: adcq %r12, %rax +; X64-NEXT: addq %r14, %r12 +; X64-NEXT: adcq %r15, %rax ; X64-NEXT: adcq %r11, %rcx ; X64-NEXT: imulq %r9, %r8 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload @@ -609,7 +600,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X64-NEXT: movq %rbx, 8(%rdi) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: movq %rcx, (%rdi) -; X64-NEXT: movq %r15, 16(%rdi) +; X64-NEXT: movq %r12, 16(%rdi) ; X64-NEXT: movq %rax, 24(%rdi) ; X64-NEXT: movl %esi, 32(%rdi) ; X64-NEXT: shrq $32, %rsi diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll index cb4bdd1ede75c..eacc714b49a4d 100644 --- a/llvm/test/CodeGen/X86/umul_fix.ll +++ b/llvm/test/CodeGen/X86/umul_fix.ll @@ -266,20 +266,20 @@ define i64 @func7(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: addl %edx, %ebx +; X86-NEXT: addl %edx, %edi ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: addl %edi, %eax ; X86-NEXT: adcl %esi, %edx ; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx @@ -359,22 +359,22 @@ define i64 @func9(i64 %x, i64 %y) nounwind { ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: adcl %edx, %ecx ; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: addl %ebp, %ecx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl %edi, %edx diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll index 33f43c75cad3d..6b6845147e043 100644 --- a/llvm/test/CodeGen/X86/umul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll @@ -52,31 +52,31 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addl %edx, %ebx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl %ebp, %edx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: shrdl $2, %eax, %ecx ; X86-NEXT: shrdl $2, %edx, %eax ; X86-NEXT: shrl $2, %edx -; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl %edi, %edx ; X86-NEXT: movl $-1, %edx ; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: cmovel %eax, %edx @@ -441,30 +441,30 @@ define i64 @func7(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebx -; X86-NEXT: addl %edx, %edi +; X86-NEXT: addl %edx, %esi ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %esi, %eax ; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl %ebp, %edx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: negl %esi +; X86-NEXT: negl %edi ; X86-NEXT: sbbl %ecx, %ecx ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: orl %ecx, %edx diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll index 9800c116ea15f..82603b35ba712 100644 --- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll @@ -44,87 +44,89 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %ebp +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: seto (%esp) # 1-byte Folded Spill -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %esi -; X86-NEXT: leal (%ecx,%eax), %ecx -; X86-NEXT: seto %bh +; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebp +; X86-NEXT: mull %edi +; X86-NEXT: leal (%ecx,%eax), %esi +; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %esi, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx +; X86-NEXT: leal (%esi,%eax), %esi +; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %edi -; X86-NEXT: leal (%ecx,%eax), %ecx -; X86-NEXT: seto %bl -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ecx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %esi, %ebx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebp, %esi -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %edi, %edx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ebx, %edx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: testl %esi, %esi ; X86-NEXT: setne %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: testl %esi, %esi ; X86-NEXT: setne %ch ; X86-NEXT: andb %cl, %ch -; X86-NEXT: orb (%esp), %bh # 1-byte Folded Reload -; X86-NEXT: orb %ch, %bh -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload -; X86-NEXT: movb %bh, (%esp) # 1-byte Spill -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload +; X86-NEXT: orb %ch, %cl +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: testl %edi, %edi ; X86-NEXT: setne %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: testl %edi, %edi ; X86-NEXT: setne %ch ; X86-NEXT: andb %cl, %ch +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload ; X86-NEXT: orb %ch, %bl ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload @@ -141,7 +143,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: setne %al ; X86-NEXT: andb %bh, %al ; X86-NEXT: orb %bl, %al -; X86-NEXT: orb (%esp), %al # 1-byte Folded Reload +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload ; X86-NEXT: andb $1, %al ; X86-NEXT: movb %al, 16(%ecx) diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll index 6f639880dc574..b1194bedc4e1c 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll @@ -1210,21 +1210,21 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 10(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %ebx -; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %ebp -; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r14d -; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r15d -; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %r12d -; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %r13d +; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %ebp +; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r14d +; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r15d +; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r12d +; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %r13d +; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %esi ; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %edx ; CHECK-BASELINE-NEXT: movzbl 2(%r8), %edi ; CHECK-BASELINE-NEXT: movzbl (%r8), %eax ; CHECK-BASELINE-NEXT: movzbl 1(%r8), %ecx -; CHECK-BASELINE-NEXT: movzbl (%rsi), %esi -; CHECK-BASELINE-NEXT: xorb %al, %sil -; CHECK-BASELINE-NEXT: andb (%r10), %sil -; CHECK-BASELINE-NEXT: xorb %al, %sil -; CHECK-BASELINE-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl (%r9), %ebx +; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: andb (%r10), %bl +; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 1(%r9), %eax ; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: andb 1(%r10), %al @@ -1241,34 +1241,34 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: xorb %dl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 4(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %r13b, %al +; CHECK-BASELINE-NEXT: xorb %sil, %al ; CHECK-BASELINE-NEXT: andb 4(%r10), %al -; CHECK-BASELINE-NEXT: xorb %r13b, %al +; CHECK-BASELINE-NEXT: xorb %sil, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 5(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %r12b, %al +; CHECK-BASELINE-NEXT: xorb %r13b, %al ; CHECK-BASELINE-NEXT: andb 5(%r10), %al -; CHECK-BASELINE-NEXT: xorb %r12b, %al +; CHECK-BASELINE-NEXT: xorb %r13b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 6(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %r15b, %al +; CHECK-BASELINE-NEXT: xorb %r12b, %al ; CHECK-BASELINE-NEXT: andb 6(%r10), %al -; CHECK-BASELINE-NEXT: xorb %r15b, %al +; CHECK-BASELINE-NEXT: xorb %r12b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 7(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %r14b, %al +; CHECK-BASELINE-NEXT: xorb %r15b, %al ; CHECK-BASELINE-NEXT: andb 7(%r10), %al -; CHECK-BASELINE-NEXT: xorb %r14b, %al +; CHECK-BASELINE-NEXT: xorb %r15b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 8(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %bpl, %al +; CHECK-BASELINE-NEXT: xorb %r14b, %al ; CHECK-BASELINE-NEXT: andb 8(%r10), %al -; CHECK-BASELINE-NEXT: xorb %bpl, %al +; CHECK-BASELINE-NEXT: xorb %r14b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 9(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %bl, %al +; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: andb 9(%r10), %al -; CHECK-BASELINE-NEXT: xorb %bl, %al +; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 10(%r9), %eax ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload @@ -1357,10 +1357,10 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: andb 24(%r10), %r14b ; CHECK-BASELINE-NEXT: xorb %al, %r14b ; CHECK-BASELINE-NEXT: movzbl 25(%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 25(%r9), %ebx -; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: andb 25(%r10), %bl -; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: movzbl 25(%r9), %ebp +; CHECK-BASELINE-NEXT: xorb %al, %bpl +; CHECK-BASELINE-NEXT: andb 25(%r10), %bpl +; CHECK-BASELINE-NEXT: xorb %al, %bpl ; CHECK-BASELINE-NEXT: movzbl 26(%r8), %eax ; CHECK-BASELINE-NEXT: movzbl 26(%r9), %edi ; CHECK-BASELINE-NEXT: xorb %al, %dil @@ -1381,11 +1381,11 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: andb 29(%r10), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movzbl 30(%r8), %ebp +; CHECK-BASELINE-NEXT: movzbl 30(%r8), %ebx ; CHECK-BASELINE-NEXT: movzbl 30(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %bpl, %al +; CHECK-BASELINE-NEXT: xorb %bl, %al ; CHECK-BASELINE-NEXT: andb 30(%r10), %al -; CHECK-BASELINE-NEXT: xorb %bpl, %al +; CHECK-BASELINE-NEXT: xorb %bl, %al ; CHECK-BASELINE-NEXT: movzbl 31(%r8), %r8d ; CHECK-BASELINE-NEXT: movzbl 31(%r9), %r9d ; CHECK-BASELINE-NEXT: xorb %r8b, %r9b @@ -1397,7 +1397,7 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movb %dl, 28(%r11) ; CHECK-BASELINE-NEXT: movb %sil, 27(%r11) ; CHECK-BASELINE-NEXT: movb %dil, 26(%r11) -; CHECK-BASELINE-NEXT: movb %bl, 25(%r11) +; CHECK-BASELINE-NEXT: movb %bpl, 25(%r11) ; CHECK-BASELINE-NEXT: movb %r14b, 24(%r11) ; CHECK-BASELINE-NEXT: movb %r15b, 23(%r11) ; CHECK-BASELINE-NEXT: movb %r12b, 22(%r11) @@ -1477,21 +1477,21 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 10(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 9(%rdx), %ebx -; CHECK-SSE1-NEXT: movzbl 8(%rdx), %ebp -; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r14d -; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r15d -; CHECK-SSE1-NEXT: movzbl 5(%rdx), %r12d -; CHECK-SSE1-NEXT: movzbl 4(%rdx), %r13d +; CHECK-SSE1-NEXT: movzbl 9(%rdx), %ebp +; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r14d +; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r15d +; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r12d +; CHECK-SSE1-NEXT: movzbl 5(%rdx), %r13d +; CHECK-SSE1-NEXT: movzbl 4(%rdx), %esi ; CHECK-SSE1-NEXT: movzbl 3(%rdx), %edx ; CHECK-SSE1-NEXT: movzbl 2(%r8), %edi ; CHECK-SSE1-NEXT: movzbl (%r8), %eax ; CHECK-SSE1-NEXT: movzbl 1(%r8), %ecx -; CHECK-SSE1-NEXT: movzbl (%rsi), %esi -; CHECK-SSE1-NEXT: xorb %al, %sil -; CHECK-SSE1-NEXT: andb (%r10), %sil -; CHECK-SSE1-NEXT: xorb %al, %sil -; CHECK-SSE1-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl (%r9), %ebx +; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: andb (%r10), %bl +; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 1(%r9), %eax ; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: andb 1(%r10), %al @@ -1508,34 +1508,34 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: xorb %dl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 4(%r9), %eax -; CHECK-SSE1-NEXT: xorb %r13b, %al +; CHECK-SSE1-NEXT: xorb %sil, %al ; CHECK-SSE1-NEXT: andb 4(%r10), %al -; CHECK-SSE1-NEXT: xorb %r13b, %al +; CHECK-SSE1-NEXT: xorb %sil, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 5(%r9), %eax -; CHECK-SSE1-NEXT: xorb %r12b, %al +; CHECK-SSE1-NEXT: xorb %r13b, %al ; CHECK-SSE1-NEXT: andb 5(%r10), %al -; CHECK-SSE1-NEXT: xorb %r12b, %al +; CHECK-SSE1-NEXT: xorb %r13b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 6(%r9), %eax -; CHECK-SSE1-NEXT: xorb %r15b, %al +; CHECK-SSE1-NEXT: xorb %r12b, %al ; CHECK-SSE1-NEXT: andb 6(%r10), %al -; CHECK-SSE1-NEXT: xorb %r15b, %al +; CHECK-SSE1-NEXT: xorb %r12b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 7(%r9), %eax -; CHECK-SSE1-NEXT: xorb %r14b, %al +; CHECK-SSE1-NEXT: xorb %r15b, %al ; CHECK-SSE1-NEXT: andb 7(%r10), %al -; CHECK-SSE1-NEXT: xorb %r14b, %al +; CHECK-SSE1-NEXT: xorb %r15b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 8(%r9), %eax -; CHECK-SSE1-NEXT: xorb %bpl, %al +; CHECK-SSE1-NEXT: xorb %r14b, %al ; CHECK-SSE1-NEXT: andb 8(%r10), %al -; CHECK-SSE1-NEXT: xorb %bpl, %al +; CHECK-SSE1-NEXT: xorb %r14b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 9(%r9), %eax -; CHECK-SSE1-NEXT: xorb %bl, %al +; CHECK-SSE1-NEXT: xorb %bpl, %al ; CHECK-SSE1-NEXT: andb 9(%r10), %al -; CHECK-SSE1-NEXT: xorb %bl, %al +; CHECK-SSE1-NEXT: xorb %bpl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 10(%r9), %eax ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload @@ -1624,10 +1624,10 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: andb 24(%r10), %r14b ; CHECK-SSE1-NEXT: xorb %al, %r14b ; CHECK-SSE1-NEXT: movzbl 25(%r8), %eax -; CHECK-SSE1-NEXT: movzbl 25(%r9), %ebx -; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: andb 25(%r10), %bl -; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: movzbl 25(%r9), %ebp +; CHECK-SSE1-NEXT: xorb %al, %bpl +; CHECK-SSE1-NEXT: andb 25(%r10), %bpl +; CHECK-SSE1-NEXT: xorb %al, %bpl ; CHECK-SSE1-NEXT: movzbl 26(%r8), %eax ; CHECK-SSE1-NEXT: movzbl 26(%r9), %edi ; CHECK-SSE1-NEXT: xorb %al, %dil @@ -1648,11 +1648,11 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: andb 29(%r10), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movzbl 30(%r8), %ebp +; CHECK-SSE1-NEXT: movzbl 30(%r8), %ebx ; CHECK-SSE1-NEXT: movzbl 30(%r9), %eax -; CHECK-SSE1-NEXT: xorb %bpl, %al +; CHECK-SSE1-NEXT: xorb %bl, %al ; CHECK-SSE1-NEXT: andb 30(%r10), %al -; CHECK-SSE1-NEXT: xorb %bpl, %al +; CHECK-SSE1-NEXT: xorb %bl, %al ; CHECK-SSE1-NEXT: movzbl 31(%r8), %r8d ; CHECK-SSE1-NEXT: movzbl 31(%r9), %r9d ; CHECK-SSE1-NEXT: xorb %r8b, %r9b @@ -1664,7 +1664,7 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movb %dl, 28(%r11) ; CHECK-SSE1-NEXT: movb %sil, 27(%r11) ; CHECK-SSE1-NEXT: movb %dil, 26(%r11) -; CHECK-SSE1-NEXT: movb %bl, 25(%r11) +; CHECK-SSE1-NEXT: movb %bpl, 25(%r11) ; CHECK-SSE1-NEXT: movb %r14b, 24(%r11) ; CHECK-SSE1-NEXT: movb %r15b, 23(%r11) ; CHECK-SSE1-NEXT: movb %r12b, 22(%r11) @@ -3231,10 +3231,12 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movq %rdx, %r13 -; CHECK-BASELINE-NEXT: movq %rsi, %r12 -; CHECK-BASELINE-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %r15d +; CHECK-BASELINE-NEXT: movq %rcx, %r12 +; CHECK-BASELINE-NEXT: movq %rdx, %r15 +; CHECK-BASELINE-NEXT: movq %rsi, %r14 +; CHECK-BASELINE-NEXT: movq %rdi, %r13 +; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %eax +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 14(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 13(%rdx), %eax @@ -3248,197 +3250,199 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %r8d ; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r9d ; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r10d -; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r11d -; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %ebp -; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %edi -; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %esi -; CHECK-BASELINE-NEXT: movzbl 2(%rdx), %edx -; CHECK-BASELINE-NEXT: movzbl (%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 1(%r13), %ebx -; CHECK-BASELINE-NEXT: movzbl (%r12), %r14d -; CHECK-BASELINE-NEXT: xorb %al, %r14b -; CHECK-BASELINE-NEXT: andb (%rcx), %r14b -; CHECK-BASELINE-NEXT: xorb %al, %r14b -; CHECK-BASELINE-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 1(%r12), %eax -; CHECK-BASELINE-NEXT: xorb %bl, %al -; CHECK-BASELINE-NEXT: andb 1(%rcx), %al -; CHECK-BASELINE-NEXT: xorb %bl, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 2(%r12), %eax -; CHECK-BASELINE-NEXT: xorb %dl, %al -; CHECK-BASELINE-NEXT: andb 2(%rcx), %al -; CHECK-BASELINE-NEXT: xorb %dl, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 3(%r12), %eax +; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %ebp +; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %esi +; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %eax +; CHECK-BASELINE-NEXT: movzbl 2(%rdx), %ecx +; CHECK-BASELINE-NEXT: movzbl (%rdx), %r11d +; CHECK-BASELINE-NEXT: movzbl 1(%rdx), %edx +; CHECK-BASELINE-NEXT: movzbl (%r14), %ebx +; CHECK-BASELINE-NEXT: xorb %r11b, %bl +; CHECK-BASELINE-NEXT: andb (%r12), %bl +; CHECK-BASELINE-NEXT: xorb %r11b, %bl +; CHECK-BASELINE-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 1(%r14), %r11d +; CHECK-BASELINE-NEXT: xorb %dl, %r11b +; CHECK-BASELINE-NEXT: andb 1(%r12), %r11b +; CHECK-BASELINE-NEXT: xorb %dl, %r11b +; CHECK-BASELINE-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 2(%r14), %edx +; CHECK-BASELINE-NEXT: xorb %cl, %dl +; CHECK-BASELINE-NEXT: andb 2(%r12), %dl +; CHECK-BASELINE-NEXT: xorb %cl, %dl +; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 3(%r14), %ecx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 3(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 4(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %sil, %al -; CHECK-BASELINE-NEXT: andb 3(%rcx), %al +; CHECK-BASELINE-NEXT: andb 4(%r12), %al ; CHECK-BASELINE-NEXT: xorb %sil, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 4(%r12), %eax +; CHECK-BASELINE-NEXT: movzbl 5(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %dil, %al -; CHECK-BASELINE-NEXT: andb 4(%rcx), %al +; CHECK-BASELINE-NEXT: andb 5(%r12), %al ; CHECK-BASELINE-NEXT: xorb %dil, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 5(%r12), %eax +; CHECK-BASELINE-NEXT: movzbl 6(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %bpl, %al -; CHECK-BASELINE-NEXT: andb 5(%rcx), %al +; CHECK-BASELINE-NEXT: andb 6(%r12), %al ; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 6(%r12), %eax -; CHECK-BASELINE-NEXT: xorb %r11b, %al -; CHECK-BASELINE-NEXT: andb 6(%rcx), %al -; CHECK-BASELINE-NEXT: xorb %r11b, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 7(%r12), %eax +; CHECK-BASELINE-NEXT: movzbl 7(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %r10b, %al -; CHECK-BASELINE-NEXT: andb 7(%rcx), %al +; CHECK-BASELINE-NEXT: andb 7(%r12), %al ; CHECK-BASELINE-NEXT: xorb %r10b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 8(%r12), %eax +; CHECK-BASELINE-NEXT: movzbl 8(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %r9b, %al -; CHECK-BASELINE-NEXT: andb 8(%rcx), %al +; CHECK-BASELINE-NEXT: andb 8(%r12), %al ; CHECK-BASELINE-NEXT: xorb %r9b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 9(%r12), %eax +; CHECK-BASELINE-NEXT: movzbl 9(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %r8b, %al -; CHECK-BASELINE-NEXT: andb 9(%rcx), %al +; CHECK-BASELINE-NEXT: andb 9(%r12), %al ; CHECK-BASELINE-NEXT: xorb %r8b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 10(%r12), %edx +; CHECK-BASELINE-NEXT: movzbl 10(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 10(%rcx), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 11(%r12), %edx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 10(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 11(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 11(%rcx), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 12(%r12), %edx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 11(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 12(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 12(%rcx), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 13(%r12), %edx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 12(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 13(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 13(%rcx), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 14(%r12), %edx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 13(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 14(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 14(%rcx), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 15(%r12), %eax -; CHECK-BASELINE-NEXT: xorb %r15b, %al -; CHECK-BASELINE-NEXT: andb 15(%rcx), %al -; CHECK-BASELINE-NEXT: xorb %r15b, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 16(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 16(%r12), %edx -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 16(%rcx), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 17(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 17(%r12), %edx -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 17(%rcx), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 18(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 18(%r12), %edx -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 18(%rcx), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 19(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 19(%r12), %r15d -; CHECK-BASELINE-NEXT: xorb %al, %r15b -; CHECK-BASELINE-NEXT: andb 19(%rcx), %r15b -; CHECK-BASELINE-NEXT: movq %rcx, %rdx -; CHECK-BASELINE-NEXT: xorb %al, %r15b -; CHECK-BASELINE-NEXT: movzbl 20(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 20(%r12), %r14d -; CHECK-BASELINE-NEXT: xorb %al, %r14b -; CHECK-BASELINE-NEXT: andb 20(%rcx), %r14b -; CHECK-BASELINE-NEXT: xorb %al, %r14b -; CHECK-BASELINE-NEXT: movzbl 21(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 21(%r12), %ebp +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 14(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 15(%r14), %ecx +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 15(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 16(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 16(%r14), %ecx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 16(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 17(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 17(%r14), %ecx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 17(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 18(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 18(%r14), %ecx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 18(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 19(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 19(%r14), %ecx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 19(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 20(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 20(%r14), %ecx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 20(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 21(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 21(%r14), %ebp ; CHECK-BASELINE-NEXT: xorb %al, %bpl -; CHECK-BASELINE-NEXT: andb 21(%rcx), %bpl +; CHECK-BASELINE-NEXT: andb 21(%r12), %bpl ; CHECK-BASELINE-NEXT: xorb %al, %bpl -; CHECK-BASELINE-NEXT: movzbl 22(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 22(%r12), %ebx +; CHECK-BASELINE-NEXT: movzbl 22(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 22(%r14), %ebx ; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: andb 22(%rcx), %bl +; CHECK-BASELINE-NEXT: andb 22(%r12), %bl ; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: movzbl 23(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 23(%r12), %r11d +; CHECK-BASELINE-NEXT: movzbl 23(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 23(%r14), %r11d ; CHECK-BASELINE-NEXT: xorb %al, %r11b -; CHECK-BASELINE-NEXT: andb 23(%rcx), %r11b +; CHECK-BASELINE-NEXT: andb 23(%r12), %r11b ; CHECK-BASELINE-NEXT: xorb %al, %r11b -; CHECK-BASELINE-NEXT: movzbl 24(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 24(%r12), %r10d -; CHECK-BASELINE-NEXT: xorb %al, %r10b -; CHECK-BASELINE-NEXT: andb 24(%rcx), %r10b -; CHECK-BASELINE-NEXT: xorb %al, %r10b -; CHECK-BASELINE-NEXT: movzbl 25(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 25(%r12), %r9d +; CHECK-BASELINE-NEXT: movzbl 24(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 24(%r14), %r9d ; CHECK-BASELINE-NEXT: xorb %al, %r9b -; CHECK-BASELINE-NEXT: andb 25(%rcx), %r9b +; CHECK-BASELINE-NEXT: andb 24(%r12), %r9b ; CHECK-BASELINE-NEXT: xorb %al, %r9b -; CHECK-BASELINE-NEXT: movzbl 26(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 26(%r12), %r8d +; CHECK-BASELINE-NEXT: movzbl 25(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 25(%r14), %r8d ; CHECK-BASELINE-NEXT: xorb %al, %r8b -; CHECK-BASELINE-NEXT: andb 26(%rcx), %r8b +; CHECK-BASELINE-NEXT: andb 25(%r12), %r8b ; CHECK-BASELINE-NEXT: xorb %al, %r8b -; CHECK-BASELINE-NEXT: movzbl 27(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 27(%r12), %edi +; CHECK-BASELINE-NEXT: movzbl 26(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 26(%r14), %edi ; CHECK-BASELINE-NEXT: xorb %al, %dil -; CHECK-BASELINE-NEXT: andb 27(%rcx), %dil +; CHECK-BASELINE-NEXT: andb 26(%r12), %dil ; CHECK-BASELINE-NEXT: xorb %al, %dil -; CHECK-BASELINE-NEXT: movzbl 28(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 28(%r12), %esi +; CHECK-BASELINE-NEXT: movzbl 27(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 27(%r14), %esi ; CHECK-BASELINE-NEXT: xorb %al, %sil -; CHECK-BASELINE-NEXT: andb 28(%rcx), %sil +; CHECK-BASELINE-NEXT: andb 27(%r12), %sil ; CHECK-BASELINE-NEXT: xorb %al, %sil -; CHECK-BASELINE-NEXT: movzbl 29(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 29(%r12), %ecx +; CHECK-BASELINE-NEXT: movzbl 28(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 28(%r14), %edx +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: andb 28(%r12), %dl +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: movzbl 29(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 29(%r14), %ecx ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 29(%rdx), %cl +; CHECK-BASELINE-NEXT: andb 29(%r12), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movzbl 30(%r13), %eax -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 30(%r12), %eax -; CHECK-BASELINE-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: andb 30(%rdx), %al -; CHECK-BASELINE-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movzbl 31(%r13), %r13d -; CHECK-BASELINE-NEXT: movzbl 31(%r12), %r12d -; CHECK-BASELINE-NEXT: xorb %r13b, %r12b -; CHECK-BASELINE-NEXT: andb 31(%rdx), %r12b -; CHECK-BASELINE-NEXT: xorb %r13b, %r12b -; CHECK-BASELINE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; CHECK-BASELINE-NEXT: movb %r12b, 31(%r13) +; CHECK-BASELINE-NEXT: movzbl 30(%r15), %r10d +; CHECK-BASELINE-NEXT: movzbl 30(%r14), %eax +; CHECK-BASELINE-NEXT: xorb %r10b, %al +; CHECK-BASELINE-NEXT: andb 30(%r12), %al +; CHECK-BASELINE-NEXT: xorb %r10b, %al +; CHECK-BASELINE-NEXT: movzbl 31(%r15), %r10d +; CHECK-BASELINE-NEXT: movzbl 31(%r14), %r14d +; CHECK-BASELINE-NEXT: xorb %r10b, %r14b +; CHECK-BASELINE-NEXT: andb 31(%r12), %r14b +; CHECK-BASELINE-NEXT: xorb %r10b, %r14b +; CHECK-BASELINE-NEXT: movb %r14b, 31(%r13) ; CHECK-BASELINE-NEXT: movb %al, 30(%r13) ; CHECK-BASELINE-NEXT: movb %cl, 29(%r13) -; CHECK-BASELINE-NEXT: movb %sil, 28(%r13) -; CHECK-BASELINE-NEXT: movb %dil, 27(%r13) -; CHECK-BASELINE-NEXT: movb %r8b, 26(%r13) -; CHECK-BASELINE-NEXT: movb %r9b, 25(%r13) -; CHECK-BASELINE-NEXT: movb %r10b, 24(%r13) +; CHECK-BASELINE-NEXT: movb %dl, 28(%r13) +; CHECK-BASELINE-NEXT: movb %sil, 27(%r13) +; CHECK-BASELINE-NEXT: movb %dil, 26(%r13) +; CHECK-BASELINE-NEXT: movb %r8b, 25(%r13) +; CHECK-BASELINE-NEXT: movb %r9b, 24(%r13) ; CHECK-BASELINE-NEXT: movb %r11b, 23(%r13) ; CHECK-BASELINE-NEXT: movb %bl, 22(%r13) ; CHECK-BASELINE-NEXT: movb %bpl, 21(%r13) -; CHECK-BASELINE-NEXT: movb %r14b, 20(%r13) -; CHECK-BASELINE-NEXT: movb %r15b, 19(%r13) +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: movb %al, 20(%r13) +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: movb %al, 19(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: movb %al, 18(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload @@ -3494,10 +3498,12 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movq %rdx, %r13 -; CHECK-SSE1-NEXT: movq %rsi, %r12 -; CHECK-SSE1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-SSE1-NEXT: movzbl 15(%rdx), %r15d +; CHECK-SSE1-NEXT: movq %rcx, %r12 +; CHECK-SSE1-NEXT: movq %rdx, %r15 +; CHECK-SSE1-NEXT: movq %rsi, %r14 +; CHECK-SSE1-NEXT: movq %rdi, %r13 +; CHECK-SSE1-NEXT: movzbl 15(%rdx), %eax +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 14(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 13(%rdx), %eax @@ -3511,197 +3517,199 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movzbl 9(%rdx), %r8d ; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r9d ; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r10d -; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r11d -; CHECK-SSE1-NEXT: movzbl 5(%rdx), %ebp -; CHECK-SSE1-NEXT: movzbl 4(%rdx), %edi -; CHECK-SSE1-NEXT: movzbl 3(%rdx), %esi -; CHECK-SSE1-NEXT: movzbl 2(%rdx), %edx -; CHECK-SSE1-NEXT: movzbl (%r13), %eax -; CHECK-SSE1-NEXT: movzbl 1(%r13), %ebx -; CHECK-SSE1-NEXT: movzbl (%r12), %r14d -; CHECK-SSE1-NEXT: xorb %al, %r14b -; CHECK-SSE1-NEXT: andb (%rcx), %r14b -; CHECK-SSE1-NEXT: xorb %al, %r14b -; CHECK-SSE1-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 1(%r12), %eax -; CHECK-SSE1-NEXT: xorb %bl, %al -; CHECK-SSE1-NEXT: andb 1(%rcx), %al -; CHECK-SSE1-NEXT: xorb %bl, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 2(%r12), %eax -; CHECK-SSE1-NEXT: xorb %dl, %al -; CHECK-SSE1-NEXT: andb 2(%rcx), %al -; CHECK-SSE1-NEXT: xorb %dl, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 3(%r12), %eax +; CHECK-SSE1-NEXT: movzbl 6(%rdx), %ebp +; CHECK-SSE1-NEXT: movzbl 5(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 4(%rdx), %esi +; CHECK-SSE1-NEXT: movzbl 3(%rdx), %eax +; CHECK-SSE1-NEXT: movzbl 2(%rdx), %ecx +; CHECK-SSE1-NEXT: movzbl (%rdx), %r11d +; CHECK-SSE1-NEXT: movzbl 1(%rdx), %edx +; CHECK-SSE1-NEXT: movzbl (%r14), %ebx +; CHECK-SSE1-NEXT: xorb %r11b, %bl +; CHECK-SSE1-NEXT: andb (%r12), %bl +; CHECK-SSE1-NEXT: xorb %r11b, %bl +; CHECK-SSE1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 1(%r14), %r11d +; CHECK-SSE1-NEXT: xorb %dl, %r11b +; CHECK-SSE1-NEXT: andb 1(%r12), %r11b +; CHECK-SSE1-NEXT: xorb %dl, %r11b +; CHECK-SSE1-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 2(%r14), %edx +; CHECK-SSE1-NEXT: xorb %cl, %dl +; CHECK-SSE1-NEXT: andb 2(%r12), %dl +; CHECK-SSE1-NEXT: xorb %cl, %dl +; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 3(%r14), %ecx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 3(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 4(%r14), %eax ; CHECK-SSE1-NEXT: xorb %sil, %al -; CHECK-SSE1-NEXT: andb 3(%rcx), %al +; CHECK-SSE1-NEXT: andb 4(%r12), %al ; CHECK-SSE1-NEXT: xorb %sil, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 4(%r12), %eax +; CHECK-SSE1-NEXT: movzbl 5(%r14), %eax ; CHECK-SSE1-NEXT: xorb %dil, %al -; CHECK-SSE1-NEXT: andb 4(%rcx), %al +; CHECK-SSE1-NEXT: andb 5(%r12), %al ; CHECK-SSE1-NEXT: xorb %dil, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 5(%r12), %eax +; CHECK-SSE1-NEXT: movzbl 6(%r14), %eax ; CHECK-SSE1-NEXT: xorb %bpl, %al -; CHECK-SSE1-NEXT: andb 5(%rcx), %al +; CHECK-SSE1-NEXT: andb 6(%r12), %al ; CHECK-SSE1-NEXT: xorb %bpl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 6(%r12), %eax -; CHECK-SSE1-NEXT: xorb %r11b, %al -; CHECK-SSE1-NEXT: andb 6(%rcx), %al -; CHECK-SSE1-NEXT: xorb %r11b, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 7(%r12), %eax +; CHECK-SSE1-NEXT: movzbl 7(%r14), %eax ; CHECK-SSE1-NEXT: xorb %r10b, %al -; CHECK-SSE1-NEXT: andb 7(%rcx), %al +; CHECK-SSE1-NEXT: andb 7(%r12), %al ; CHECK-SSE1-NEXT: xorb %r10b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 8(%r12), %eax +; CHECK-SSE1-NEXT: movzbl 8(%r14), %eax ; CHECK-SSE1-NEXT: xorb %r9b, %al -; CHECK-SSE1-NEXT: andb 8(%rcx), %al +; CHECK-SSE1-NEXT: andb 8(%r12), %al ; CHECK-SSE1-NEXT: xorb %r9b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 9(%r12), %eax +; CHECK-SSE1-NEXT: movzbl 9(%r14), %eax ; CHECK-SSE1-NEXT: xorb %r8b, %al -; CHECK-SSE1-NEXT: andb 9(%rcx), %al +; CHECK-SSE1-NEXT: andb 9(%r12), %al ; CHECK-SSE1-NEXT: xorb %r8b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 10(%r12), %edx +; CHECK-SSE1-NEXT: movzbl 10(%r14), %ecx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 10(%rcx), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 11(%r12), %edx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 10(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 11(%r14), %ecx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 11(%rcx), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 12(%r12), %edx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 11(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 12(%r14), %ecx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 12(%rcx), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 13(%r12), %edx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 12(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 13(%r14), %ecx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 13(%rcx), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 14(%r12), %edx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 13(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 14(%r14), %ecx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 14(%rcx), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 15(%r12), %eax -; CHECK-SSE1-NEXT: xorb %r15b, %al -; CHECK-SSE1-NEXT: andb 15(%rcx), %al -; CHECK-SSE1-NEXT: xorb %r15b, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 16(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 16(%r12), %edx -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 16(%rcx), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 17(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 17(%r12), %edx -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 17(%rcx), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 18(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 18(%r12), %edx -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 18(%rcx), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 19(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 19(%r12), %r15d -; CHECK-SSE1-NEXT: xorb %al, %r15b -; CHECK-SSE1-NEXT: andb 19(%rcx), %r15b -; CHECK-SSE1-NEXT: movq %rcx, %rdx -; CHECK-SSE1-NEXT: xorb %al, %r15b -; CHECK-SSE1-NEXT: movzbl 20(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 20(%r12), %r14d -; CHECK-SSE1-NEXT: xorb %al, %r14b -; CHECK-SSE1-NEXT: andb 20(%rcx), %r14b -; CHECK-SSE1-NEXT: xorb %al, %r14b -; CHECK-SSE1-NEXT: movzbl 21(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 21(%r12), %ebp +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 14(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 15(%r14), %ecx +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 15(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 16(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 16(%r14), %ecx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 16(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 17(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 17(%r14), %ecx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 17(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 18(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 18(%r14), %ecx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 18(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 19(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 19(%r14), %ecx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 19(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 20(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 20(%r14), %ecx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 20(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 21(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 21(%r14), %ebp ; CHECK-SSE1-NEXT: xorb %al, %bpl -; CHECK-SSE1-NEXT: andb 21(%rcx), %bpl +; CHECK-SSE1-NEXT: andb 21(%r12), %bpl ; CHECK-SSE1-NEXT: xorb %al, %bpl -; CHECK-SSE1-NEXT: movzbl 22(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 22(%r12), %ebx +; CHECK-SSE1-NEXT: movzbl 22(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 22(%r14), %ebx ; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: andb 22(%rcx), %bl +; CHECK-SSE1-NEXT: andb 22(%r12), %bl ; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: movzbl 23(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 23(%r12), %r11d +; CHECK-SSE1-NEXT: movzbl 23(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 23(%r14), %r11d ; CHECK-SSE1-NEXT: xorb %al, %r11b -; CHECK-SSE1-NEXT: andb 23(%rcx), %r11b +; CHECK-SSE1-NEXT: andb 23(%r12), %r11b ; CHECK-SSE1-NEXT: xorb %al, %r11b -; CHECK-SSE1-NEXT: movzbl 24(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 24(%r12), %r10d -; CHECK-SSE1-NEXT: xorb %al, %r10b -; CHECK-SSE1-NEXT: andb 24(%rcx), %r10b -; CHECK-SSE1-NEXT: xorb %al, %r10b -; CHECK-SSE1-NEXT: movzbl 25(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 25(%r12), %r9d +; CHECK-SSE1-NEXT: movzbl 24(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 24(%r14), %r9d ; CHECK-SSE1-NEXT: xorb %al, %r9b -; CHECK-SSE1-NEXT: andb 25(%rcx), %r9b +; CHECK-SSE1-NEXT: andb 24(%r12), %r9b ; CHECK-SSE1-NEXT: xorb %al, %r9b -; CHECK-SSE1-NEXT: movzbl 26(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 26(%r12), %r8d +; CHECK-SSE1-NEXT: movzbl 25(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 25(%r14), %r8d ; CHECK-SSE1-NEXT: xorb %al, %r8b -; CHECK-SSE1-NEXT: andb 26(%rcx), %r8b +; CHECK-SSE1-NEXT: andb 25(%r12), %r8b ; CHECK-SSE1-NEXT: xorb %al, %r8b -; CHECK-SSE1-NEXT: movzbl 27(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 27(%r12), %edi +; CHECK-SSE1-NEXT: movzbl 26(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 26(%r14), %edi ; CHECK-SSE1-NEXT: xorb %al, %dil -; CHECK-SSE1-NEXT: andb 27(%rcx), %dil +; CHECK-SSE1-NEXT: andb 26(%r12), %dil ; CHECK-SSE1-NEXT: xorb %al, %dil -; CHECK-SSE1-NEXT: movzbl 28(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 28(%r12), %esi +; CHECK-SSE1-NEXT: movzbl 27(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 27(%r14), %esi ; CHECK-SSE1-NEXT: xorb %al, %sil -; CHECK-SSE1-NEXT: andb 28(%rcx), %sil +; CHECK-SSE1-NEXT: andb 27(%r12), %sil ; CHECK-SSE1-NEXT: xorb %al, %sil -; CHECK-SSE1-NEXT: movzbl 29(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 29(%r12), %ecx +; CHECK-SSE1-NEXT: movzbl 28(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 28(%r14), %edx +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: andb 28(%r12), %dl +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: movzbl 29(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 29(%r14), %ecx ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 29(%rdx), %cl +; CHECK-SSE1-NEXT: andb 29(%r12), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movzbl 30(%r13), %eax -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 30(%r12), %eax -; CHECK-SSE1-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload -; CHECK-SSE1-NEXT: andb 30(%rdx), %al -; CHECK-SSE1-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movzbl 31(%r13), %r13d -; CHECK-SSE1-NEXT: movzbl 31(%r12), %r12d -; CHECK-SSE1-NEXT: xorb %r13b, %r12b -; CHECK-SSE1-NEXT: andb 31(%rdx), %r12b -; CHECK-SSE1-NEXT: xorb %r13b, %r12b -; CHECK-SSE1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; CHECK-SSE1-NEXT: movb %r12b, 31(%r13) +; CHECK-SSE1-NEXT: movzbl 30(%r15), %r10d +; CHECK-SSE1-NEXT: movzbl 30(%r14), %eax +; CHECK-SSE1-NEXT: xorb %r10b, %al +; CHECK-SSE1-NEXT: andb 30(%r12), %al +; CHECK-SSE1-NEXT: xorb %r10b, %al +; CHECK-SSE1-NEXT: movzbl 31(%r15), %r10d +; CHECK-SSE1-NEXT: movzbl 31(%r14), %r14d +; CHECK-SSE1-NEXT: xorb %r10b, %r14b +; CHECK-SSE1-NEXT: andb 31(%r12), %r14b +; CHECK-SSE1-NEXT: xorb %r10b, %r14b +; CHECK-SSE1-NEXT: movb %r14b, 31(%r13) ; CHECK-SSE1-NEXT: movb %al, 30(%r13) ; CHECK-SSE1-NEXT: movb %cl, 29(%r13) -; CHECK-SSE1-NEXT: movb %sil, 28(%r13) -; CHECK-SSE1-NEXT: movb %dil, 27(%r13) -; CHECK-SSE1-NEXT: movb %r8b, 26(%r13) -; CHECK-SSE1-NEXT: movb %r9b, 25(%r13) -; CHECK-SSE1-NEXT: movb %r10b, 24(%r13) +; CHECK-SSE1-NEXT: movb %dl, 28(%r13) +; CHECK-SSE1-NEXT: movb %sil, 27(%r13) +; CHECK-SSE1-NEXT: movb %dil, 26(%r13) +; CHECK-SSE1-NEXT: movb %r8b, 25(%r13) +; CHECK-SSE1-NEXT: movb %r9b, 24(%r13) ; CHECK-SSE1-NEXT: movb %r11b, 23(%r13) ; CHECK-SSE1-NEXT: movb %bl, 22(%r13) ; CHECK-SSE1-NEXT: movb %bpl, 21(%r13) -; CHECK-SSE1-NEXT: movb %r14b, 20(%r13) -; CHECK-SSE1-NEXT: movb %r15b, 19(%r13) +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; CHECK-SSE1-NEXT: movb %al, 20(%r13) +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; CHECK-SSE1-NEXT: movb %al, 19(%r13) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: movb %al, 18(%r13) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll index 8df101852f06b..ebb5e135eacd0 100644 --- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll @@ -45,15 +45,14 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $16, %esp -; X86-NEXT: movb {{[0-9]+}}(%esp), %ch +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movb %ch, %cl ; X86-NEXT: shll %cl, %eax ; X86-NEXT: shldl %cl, %esi, %edx ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testb $32, %ch +; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovnel %eax, %edx ; X86-NEXT: cmovnel %ebx, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill @@ -62,31 +61,32 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testb $32, %ch +; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovnel %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movb {{[0-9]+}}(%esp), %ch ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi +; X86-NEXT: movb %ch, %cl ; X86-NEXT: shll %cl, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: testb $32, %cl +; X86-NEXT: testb $32, %ch ; X86-NEXT: cmovnel %esi, %edx ; X86-NEXT: cmovnel %ebx, %esi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: shrl %cl, %edi -; X86-NEXT: testb $32, %cl +; X86-NEXT: testb $32, %ch ; X86-NEXT: cmovel %edi, %ebx ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movb %ch, %cl +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: shrdl %cl, %ebp, %eax -; X86-NEXT: testb $32, %ch +; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %esi, %ebp -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movb %ch, %cl ; X86-NEXT: shrdl %cl, %edx, %ebp -; X86-NEXT: testb $32, {{[0-9]+}}(%esp) +; X86-NEXT: testb $32, %ch ; X86-NEXT: cmovnel %edi, %ebp ; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -169,7 +169,7 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movb {{[0-9]+}}(%esp), %ah ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx @@ -178,36 +178,38 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl %esi, %ebp ; X86-NEXT: shrl %cl, %ebp ; X86-NEXT: cmpl %ebp, %ebx +; X86-NEXT: movl $-1, %edx +; X86-NEXT: cmovnel %edx, %esi ; X86-NEXT: movl $-1, %ebx -; X86-NEXT: cmovnel %ebx, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movb %dl, %cl -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %edi, %edx +; X86-NEXT: movb %ah, %cl +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: shrl %cl, %ebp ; X86-NEXT: cmpl %ebp, %edi -; X86-NEXT: cmovnel %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: movb %ch, %cl -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpl %edi, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: cmovnel %ebx, %edx +; X86-NEXT: movl $-1, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movb %ch, %cl +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: shrl %cl, %ebp +; X86-NEXT: cmpl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmovnel %eax, %edi +; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll %cl, %ebp -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnel %ebx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebp, 12(%ecx) -; X86-NEXT: movl %edx, 8(%ecx) -; X86-NEXT: movl %eax, 4(%ecx) -; X86-NEXT: movl %esi, (%ecx) -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: cmpl %eax, %ebx +; X86-NEXT: movl $-1, %eax +; X86-NEXT: cmovnel %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ebp, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -330,15 +332,16 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmovnel %esi, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl $65535, %eax # imm = 0xFFFF +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %ebp ; X86-NEXT: shll %cl, %ebp ; X86-NEXT: movzwl %bp, %edx ; X86-NEXT: shrl %cl, %edx -; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmpw %dx, %si ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovnel %esi, %ebp +; X86-NEXT: cmovnel %eax, %ebp ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: shll %cl, %ebx ; X86-NEXT: movzwl %bx, %esi @@ -355,26 +358,26 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: cmpw %ax, %dx ; X86-NEXT: cmovnel %esi, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movzwl %dx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shll %cl, %esi +; X86-NEXT: movzwl %si, %eax ; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpw %ax, %si +; X86-NEXT: cmpw %ax, %dx ; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: cmovnel %eax, %edx +; X86-NEXT: cmovnel %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %esi -; X86-NEXT: shrl %cl, %esi +; X86-NEXT: movzwl %ax, %edx +; X86-NEXT: shrl %cl, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpw %si, %cx +; X86-NEXT: cmpw %dx, %cx ; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF ; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movw %ax, 14(%ecx) -; X86-NEXT: movw %dx, 12(%ecx) +; X86-NEXT: movw %si, 12(%ecx) ; X86-NEXT: movw %di, 10(%ecx) ; X86-NEXT: movw %bx, 8(%ecx) ; X86-NEXT: movw %bp, 6(%ecx) diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll index 61e648eec855f..99a3821bb9ba9 100644 --- a/llvm/test/CodeGen/X86/var-permute-128.ll +++ b/llvm/test/CodeGen/X86/var-permute-128.ll @@ -247,7 +247,7 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind { ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm5 +; SSE3-NEXT: movd %eax, %xmm6 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -259,7 +259,7 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind { ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm6 +; SSE3-NEXT: movd %eax, %xmm5 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -271,11 +271,11 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind { ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm11 +; SSE3-NEXT: movd %eax, %xmm12 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm12 +; SSE3-NEXT: movd %eax, %xmm11 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -295,18 +295,18 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind { ; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v16i8: @@ -511,7 +511,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %in ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm5 +; SSE3-NEXT: movd %eax, %xmm6 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -523,7 +523,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %in ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm6 +; SSE3-NEXT: movd %eax, %xmm5 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -535,11 +535,11 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %in ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm11 +; SSE3-NEXT: movd %eax, %xmm12 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm12 +; SSE3-NEXT: movd %eax, %xmm11 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -559,18 +559,18 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %in ; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8: @@ -713,7 +713,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSE3-NEXT: movd %eax, %xmm3 ; SSE3-NEXT: andl $31, %r15d ; SSE3-NEXT: movzbl 32(%rsp,%r15), %eax -; SSE3-NEXT: movd %eax, %xmm5 +; SSE3-NEXT: movd %eax, %xmm6 ; SSE3-NEXT: andl $31, %r14d ; SSE3-NEXT: movzbl 64(%rsp,%r14), %eax ; SSE3-NEXT: movd %eax, %xmm7 @@ -722,7 +722,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSE3-NEXT: movd %eax, %xmm8 ; SSE3-NEXT: andl $31, %r11d ; SSE3-NEXT: movzbl 128(%rsp,%r11), %eax -; SSE3-NEXT: movd %eax, %xmm6 +; SSE3-NEXT: movd %eax, %xmm5 ; SSE3-NEXT: andl $31, %r10d ; SSE3-NEXT: movzbl 160(%rsp,%r10), %eax ; SSE3-NEXT: movd %eax, %xmm9 @@ -731,10 +731,10 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSE3-NEXT: movd %eax, %xmm10 ; SSE3-NEXT: andl $31, %r8d ; SSE3-NEXT: movzbl 224(%rsp,%r8), %eax -; SSE3-NEXT: movd %eax, %xmm11 +; SSE3-NEXT: movd %eax, %xmm12 ; SSE3-NEXT: andl $31, %edi ; SSE3-NEXT: movzbl 256(%rsp,%rdi), %eax -; SSE3-NEXT: movd %eax, %xmm12 +; SSE3-NEXT: movd %eax, %xmm11 ; SSE3-NEXT: andl $31, %esi ; SSE3-NEXT: movzbl 288(%rsp,%rsi), %eax ; SSE3-NEXT: movd %eax, %xmm13 @@ -751,18 +751,18 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE3-NEXT: addq $424, %rsp # imm = 0x1A8 ; SSE3-NEXT: popq %rbx ; SSE3-NEXT: popq %r12 @@ -845,7 +845,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSSE3-NEXT: movd %eax, %xmm3 ; SSSE3-NEXT: andl $31, %r15d ; SSSE3-NEXT: movzbl 32(%rsp,%r15), %eax -; SSSE3-NEXT: movd %eax, %xmm5 +; SSSE3-NEXT: movd %eax, %xmm6 ; SSSE3-NEXT: andl $31, %r14d ; SSSE3-NEXT: movzbl 64(%rsp,%r14), %eax ; SSSE3-NEXT: movd %eax, %xmm7 @@ -854,7 +854,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSSE3-NEXT: movd %eax, %xmm8 ; SSSE3-NEXT: andl $31, %r11d ; SSSE3-NEXT: movzbl 128(%rsp,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: movd %eax, %xmm5 ; SSSE3-NEXT: andl $31, %r10d ; SSSE3-NEXT: movzbl 160(%rsp,%r10), %eax ; SSSE3-NEXT: movd %eax, %xmm9 @@ -863,10 +863,10 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSSE3-NEXT: movd %eax, %xmm10 ; SSSE3-NEXT: andl $31, %r8d ; SSSE3-NEXT: movzbl 224(%rsp,%r8), %eax -; SSSE3-NEXT: movd %eax, %xmm11 +; SSSE3-NEXT: movd %eax, %xmm12 ; SSSE3-NEXT: andl $31, %edi ; SSSE3-NEXT: movzbl 256(%rsp,%rdi), %eax -; SSSE3-NEXT: movd %eax, %xmm12 +; SSSE3-NEXT: movd %eax, %xmm11 ; SSSE3-NEXT: andl $31, %esi ; SSSE3-NEXT: movzbl 288(%rsp,%rsi), %eax ; SSSE3-NEXT: movd %eax, %xmm13 @@ -883,18 +883,18 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSSE3-NEXT: addq $424, %rsp # imm = 0x1A8 ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/var-permute-512.ll b/llvm/test/CodeGen/X86/var-permute-512.ll index b55fd27d4036c..032ffb0d0bf7d 100644 --- a/llvm/test/CodeGen/X86/var-permute-512.ll +++ b/llvm/test/CodeGen/X86/var-permute-512.ll @@ -1093,21 +1093,21 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrd $3, %xmm3, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm4 +; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm5 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm4, %eax +; AVX512F-NEXT: vmovd %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $1, %xmm4, %eax +; AVX512F-NEXT: vpextrd $1, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm5 +; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 ; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 ; AVX512F-NEXT: andl $63, %esi ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $3, %xmm4, %eax +; AVX512F-NEXT: vpextrd $3, %xmm5, %eax ; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm1 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 @@ -1123,44 +1123,44 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512F-NEXT: vpextrd $3, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm5, %eax +; AVX512F-NEXT: vmovd %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vmovd %eax, %xmm1 -; AVX512F-NEXT: vpextrd $1, %xmm5, %eax +; AVX512F-NEXT: vpextrd $1, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $2, %xmm5, %eax +; AVX512F-NEXT: vpextrd $2, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX512F-NEXT: vpextrd $3, %xmm4, %eax +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vmovd %xmm4, %eax +; AVX512F-NEXT: vmovd %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $1, %xmm4, %eax +; AVX512F-NEXT: vpextrd $1, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $2, %xmm4, %eax +; AVX512F-NEXT: vpextrd $2, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $3, %xmm4, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm5, %xmm4 +; AVX512F-NEXT: vpextrd $3, %xmm5, %eax +; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm5 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vmovd %xmm4, %eax +; AVX512F-NEXT: vmovd %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $1, %xmm4, %eax +; AVX512F-NEXT: vpextrd $1, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $2, %xmm4, %eax +; AVX512F-NEXT: vpextrd $2, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm1, %xmm6 -; AVX512F-NEXT: vpextrd $3, %xmm4, %eax -; AVX512F-NEXT: vextracti32x4 $3, %zmm5, %xmm1 +; AVX512F-NEXT: vpextrd $3, %xmm5, %eax +; AVX512F-NEXT: vextracti32x4 $3, %zmm4, %xmm1 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm4 ; AVX512F-NEXT: vmovd %xmm1, %eax @@ -1344,21 +1344,21 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm4, %eax +; AVX512BW-NEXT: vmovd %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm5 +; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 ; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 ; AVX512BW-NEXT: andl $63, %esi ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 @@ -1374,44 +1374,44 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm5, %eax +; AVX512BW-NEXT: vmovd %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vmovd %eax, %xmm1 -; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $2, %xmm5, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax +; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vmovd %xmm4, %eax +; AVX512BW-NEXT: vmovd %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $2, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax -; AVX512BW-NEXT: vextracti32x4 $2, %zmm5, %xmm4 +; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax +; AVX512BW-NEXT: vextracti32x4 $2, %zmm4, %xmm5 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vmovd %xmm4, %eax +; AVX512BW-NEXT: vmovd %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $2, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm1, %xmm6 -; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax -; AVX512BW-NEXT: vextracti32x4 $3, %zmm5, %xmm1 +; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax +; AVX512BW-NEXT: vextracti32x4 $3, %zmm4, %xmm1 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm4 ; AVX512BW-NEXT: vmovd %xmm1, %eax diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll index a7ac1a19ea024..17c5ff7955106 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll @@ -156,15 +156,14 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 ; AVX512VL-32-NEXT: setae %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: movl %eax, %edi +; AVX512VL-32-NEXT: movl %eax, %esi ; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovsd %xmm3, (%esp) -; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: xorl %ebx, %ebx ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: movl %eax, %esi +; AVX512VL-32-NEXT: setae %bl +; AVX512VL-32-NEXT: kmovw %ebx, %k1 ; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) @@ -203,10 +202,11 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX512VL-32-NEXT: xorl %ebx, %ebx +; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 -; AVX512VL-32-NEXT: setae %bl -; AVX512VL-32-NEXT: kmovw %ebx, %k1 +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: movl %eax, %edi ; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) @@ -234,14 +234,14 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: shll $31, %ebx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 ; AVX512VL-32-NEXT: shll $31, %esi ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %esi -; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; AVX512VL-32-NEXT: shll $31, %edi -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edi ; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 ; AVX512VL-32-NEXT: shll $31, %edx ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero @@ -264,10 +264,10 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX512VL-32-NEXT: shll $31, %ebx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx +; AVX512VL-32-NEXT: shll $31, %edi +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edi ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm3, %xmm3 -; AVX512VL-32-NEXT: vpinsrd $3, %ebx, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 ; AVX512VL-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -448,16 +448,15 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 ; AVX512VL-32-NEXT: setae %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: movl %eax, %edi +; AVX512VL-32-NEXT: movl %eax, %esi ; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovss %xmm3, (%esp) ; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] -; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: xorl %ebx, %ebx ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: movl %eax, %esi +; AVX512VL-32-NEXT: setae %bl +; AVX512VL-32-NEXT: kmovw %ebx, %k1 ; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) @@ -495,10 +494,11 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX512VL-32-NEXT: xorl %ebx, %ebx +; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 -; AVX512VL-32-NEXT: setae %bl -; AVX512VL-32-NEXT: kmovw %ebx, %k1 +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: movl %eax, %edi ; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) @@ -526,14 +526,14 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: shll $31, %ebx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 ; AVX512VL-32-NEXT: shll $31, %esi ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %esi -; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; AVX512VL-32-NEXT: shll $31, %edi -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edi ; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 ; AVX512VL-32-NEXT: shll $31, %edx ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero @@ -556,10 +556,10 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX512VL-32-NEXT: shll $31, %ebx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx +; AVX512VL-32-NEXT: shll $31, %edi +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edi ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm3, %xmm3 -; AVX512VL-32-NEXT: vpinsrd $3, %ebx, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 ; AVX512VL-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index 88e4b11b66f54..b275814cc8033 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -423,148 +423,150 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: movd %r8d, %xmm0 -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: movd %esi, %xmm1 +; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: movd %esi, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] ; SSE2-NEXT: movd %r9d, %xmm0 -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero ; SSE2-NEXT: pmuludq %xmm6, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: pxor %xmm8, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 -; SSE2-NEXT: pand %xmm1, %xmm8 +; SSE2-NEXT: pand %xmm4, %xmm8 ; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 ; SSE2-NEXT: pand %xmm5, %xmm9 ; SSE2-NEXT: paddd %xmm8, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm8, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; SSE2-NEXT: psubd %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: movdqa %xmm4, (%rcx) +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm5, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 ; SSE2-NEXT: pand %xmm3, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: paddd %xmm8, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: paddd %xmm8, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pmuludq %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE2-NEXT: psubd %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: psubd %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movq %xmm0, 16(%rcx) ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm5, %xmm0 ; SSE2-NEXT: movq %xmm0, 16(%rdi) -; SSE2-NEXT: movdqa %xmm1, (%rdi) +; SSE2-NEXT: movdqa %xmm4, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: smulo_v6i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: movd %r8d, %xmm0 -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: movd %esi, %xmm1 +; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: movd %esi, %xmm4 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] ; SSSE3-NEXT: movd %r9d, %xmm0 -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero ; SSSE3-NEXT: pmuludq %xmm6, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: pxor %xmm7, %xmm7 +; SSSE3-NEXT: pxor %xmm6, %xmm6 ; SSSE3-NEXT: pxor %xmm8, %xmm8 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 -; SSSE3-NEXT: pand %xmm1, %xmm8 +; SSSE3-NEXT: pand %xmm4, %xmm8 ; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm9 ; SSSE3-NEXT: pand %xmm5, %xmm9 ; SSSE3-NEXT: paddd %xmm8, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,3,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm8, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; SSSE3-NEXT: psubd %xmm9, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSSE3-NEXT: movdqa %xmm1, (%rcx) -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSSE3-NEXT: movdqa %xmm4, (%rcx) +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 ; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm5, %xmm4 ; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 ; SSSE3-NEXT: pand %xmm3, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: paddd %xmm8, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pand %xmm7, %xmm6 +; SSSE3-NEXT: paddd %xmm8, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSSE3-NEXT: pmuludq %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSSE3-NEXT: psubd %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSSE3-NEXT: pmuludq %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSSE3-NEXT: psubd %xmm6, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: movq %xmm0, 16(%rcx) ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 ; SSSE3-NEXT: pxor %xmm5, %xmm0 ; SSSE3-NEXT: movq %xmm0, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm1, (%rdi) +; SSSE3-NEXT: movdqa %xmm4, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: smulo_v6i32: @@ -579,25 +581,25 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pmuldq %xmm2, %xmm0 ; SSE41-NEXT: pinsrd $3, %r8d, %xmm2 -; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %edx ; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE41-NEXT: movd %r9d, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: pmuldq %xmm3, %xmm4 -; SSE41-NEXT: pinsrd $1, %ecx, %xmm3 -; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %edx -; SSE41-NEXT: pinsrd $1, %edx, %xmm5 +; SSE41-NEXT: pinsrd $1, %edx, %xmm3 +; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SSE41-NEXT: pinsrd $1, %esi, %xmm5 ; SSE41-NEXT: pmulld %xmm3, %xmm5 ; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm1 -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE41-NEXT: movd %ecx, %xmm3 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT: movd %edx, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE41-NEXT: movd %edx, %xmm6 +; SSE41-NEXT: movd %esi, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] ; SSE41-NEXT: pmuldq %xmm3, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7] -; SSE41-NEXT: movq %xmm5, 16(%rsi) +; SSE41-NEXT: movq %xmm5, 16(%rcx) ; SSE41-NEXT: psrad $31, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 @@ -608,7 +610,7 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] ; SSE41-NEXT: pmulld %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, (%rsi) +; SSE41-NEXT: movdqa %xmm1, (%rcx) ; SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE41-NEXT: pxor %xmm3, %xmm1 @@ -1887,103 +1889,103 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: pxor %xmm8, %xmm8 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] -; SSE2-NEXT: pmulhw %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm8 -; SSE2-NEXT: psrlw $8, %xmm8 ; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] +; SSE2-NEXT: pmulhw %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm8 +; SSE2-NEXT: psrlw $8, %xmm8 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] ; SSE2-NEXT: pxor %xmm7, %xmm7 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; SSE2-NEXT: pmulhw %xmm10, %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm10 -; SSE2-NEXT: psrlw $8, %xmm10 -; SSE2-NEXT: packuswb %xmm8, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm8, %xmm9 -; SSE2-NEXT: pand %xmm8, %xmm7 -; SSE2-NEXT: packuswb %xmm9, %xmm7 +; SSE2-NEXT: pmulhw %xmm9, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: psrlw $8, %xmm11 +; SSE2-NEXT: packuswb %xmm8, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm9, %xmm7 +; SSE2-NEXT: packuswb %xmm10, %xmm7 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpgtb %xmm7, %xmm3 -; SSE2-NEXT: pcmpeqb %xmm10, %xmm3 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; SSE2-NEXT: pcmpeqb %xmm11, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] ; SSE2-NEXT: pxor %xmm10, %xmm10 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] -; SSE2-NEXT: pmulhw %xmm9, %xmm10 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; SSE2-NEXT: pmulhw %xmm8, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; SSE2-NEXT: movdqa %xmm10, %xmm2 ; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: pmulhw %xmm9, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm9 -; SSE2-NEXT: psrlw $8, %xmm9 -; SSE2-NEXT: packuswb %xmm2, %xmm9 -; SSE2-NEXT: pand %xmm8, %xmm10 -; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pmulhw %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: psrlw $8, %xmm8 +; SSE2-NEXT: packuswb %xmm2, %xmm8 +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm9, %xmm6 ; SSE2-NEXT: packuswb %xmm10, %xmm6 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm6, %xmm2 -; SSE2-NEXT: pcmpeqb %xmm9, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; SSE2-NEXT: pcmpeqb %xmm8, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] ; SSE2-NEXT: pxor %xmm10, %xmm10 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] -; SSE2-NEXT: pmulhw %xmm9, %xmm10 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSE2-NEXT: pmulhw %xmm8, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pmulhw %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: psrlw $8, %xmm9 -; SSE2-NEXT: packuswb %xmm1, %xmm9 -; SSE2-NEXT: pand %xmm8, %xmm10 -; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pmulhw %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: psrlw $8, %xmm8 +; SSE2-NEXT: packuswb %xmm1, %xmm8 +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm9, %xmm5 ; SSE2-NEXT: packuswb %xmm10, %xmm5 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; SSE2-NEXT: pcmpeqb %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] ; SSE2-NEXT: pxor %xmm10, %xmm10 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] -; SSE2-NEXT: pmulhw %xmm9, %xmm10 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: pmulhw %xmm8, %xmm10 +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] ; SSE2-NEXT: movdqa %xmm10, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pmulhw %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm9 -; SSE2-NEXT: psrlw $8, %xmm9 -; SSE2-NEXT: packuswb %xmm0, %xmm9 -; SSE2-NEXT: pand %xmm8, %xmm10 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: packuswb %xmm10, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpgtb %xmm4, %xmm8 -; SSE2-NEXT: pcmpeqb %xmm9, %xmm8 +; SSE2-NEXT: pmulhw %xmm11, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm11 +; SSE2-NEXT: psrlw $8, %xmm11 +; SSE2-NEXT: packuswb %xmm0, %xmm11 +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm9, %xmm8 +; SSE2-NEXT: packuswb %xmm10, %xmm8 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm8, %xmm4 +; SSE2-NEXT: pcmpeqb %xmm11, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm7, 48(%rsi) ; SSE2-NEXT: movdqa %xmm1, %xmm7 ; SSE2-NEXT: movdqa %xmm6, 32(%rsi) ; SSE2-NEXT: movdqa %xmm2, %xmm6 ; SSE2-NEXT: movdqa %xmm5, 16(%rsi) ; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm4, (%rsi) -; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm8, (%rsi) +; SSE2-NEXT: movdqa %xmm3, %xmm8 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSE2-NEXT: movdqa %xmm3, 192(%rdi) @@ -1995,20 +1997,20 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSE2-NEXT: movdqa %xmm1, 64(%rdi) -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa %xmm8, (%rdi) -; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 224(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm4, (%rdi) +; SSE2-NEXT: movdqa %xmm8, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm8 ; SSE2-NEXT: psrad $31, %xmm8 -; SSE2-NEXT: movdqa %xmm8, 240(%rdi) +; SSE2-NEXT: movdqa %xmm8, 224(%rdi) +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 240(%rdi) ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] @@ -2066,103 +2068,103 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: pxor %xmm8, %xmm8 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] -; SSSE3-NEXT: pmulhw %xmm8, %xmm9 -; SSSE3-NEXT: movdqa %xmm9, %xmm8 -; SSSE3-NEXT: psrlw $8, %xmm8 ; SSSE3-NEXT: pxor %xmm10, %xmm10 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] +; SSSE3-NEXT: pmulhw %xmm8, %xmm10 +; SSSE3-NEXT: movdqa %xmm10, %xmm8 +; SSSE3-NEXT: psrlw $8, %xmm8 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] ; SSSE3-NEXT: pxor %xmm7, %xmm7 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; SSSE3-NEXT: pmulhw %xmm10, %xmm7 -; SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSSE3-NEXT: psrlw $8, %xmm10 -; SSSE3-NEXT: packuswb %xmm8, %xmm10 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; SSSE3-NEXT: pand %xmm8, %xmm9 -; SSSE3-NEXT: pand %xmm8, %xmm7 -; SSSE3-NEXT: packuswb %xmm9, %xmm7 +; SSSE3-NEXT: pmulhw %xmm9, %xmm7 +; SSSE3-NEXT: movdqa %xmm7, %xmm11 +; SSSE3-NEXT: psrlw $8, %xmm11 +; SSSE3-NEXT: packuswb %xmm8, %xmm11 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pand %xmm9, %xmm7 +; SSSE3-NEXT: packuswb %xmm10, %xmm7 ; SSSE3-NEXT: pxor %xmm3, %xmm3 ; SSSE3-NEXT: pcmpgtb %xmm7, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm10, %xmm3 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; SSSE3-NEXT: pcmpeqb %xmm11, %xmm3 +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] ; SSSE3-NEXT: pxor %xmm10, %xmm10 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] -; SSSE3-NEXT: pmulhw %xmm9, %xmm10 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; SSSE3-NEXT: pmulhw %xmm8, %xmm10 +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; SSSE3-NEXT: pxor %xmm6, %xmm6 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; SSSE3-NEXT: movdqa %xmm10, %xmm2 ; SSSE3-NEXT: psrlw $8, %xmm2 -; SSSE3-NEXT: pmulhw %xmm9, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, %xmm9 -; SSSE3-NEXT: psrlw $8, %xmm9 -; SSSE3-NEXT: packuswb %xmm2, %xmm9 -; SSSE3-NEXT: pand %xmm8, %xmm10 -; SSSE3-NEXT: pand %xmm8, %xmm6 +; SSSE3-NEXT: pmulhw %xmm8, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: psrlw $8, %xmm8 +; SSSE3-NEXT: packuswb %xmm2, %xmm8 +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pand %xmm9, %xmm6 ; SSSE3-NEXT: packuswb %xmm10, %xmm6 ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: pcmpgtb %xmm6, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm9, %xmm2 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; SSSE3-NEXT: pcmpeqb %xmm8, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] ; SSSE3-NEXT: pxor %xmm10, %xmm10 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] -; SSSE3-NEXT: pmulhw %xmm9, %xmm10 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSSE3-NEXT: pmulhw %xmm8, %xmm10 +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; SSSE3-NEXT: pxor %xmm5, %xmm5 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; SSSE3-NEXT: movdqa %xmm10, %xmm1 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: pmulhw %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm9 -; SSSE3-NEXT: psrlw $8, %xmm9 -; SSSE3-NEXT: packuswb %xmm1, %xmm9 -; SSSE3-NEXT: pand %xmm8, %xmm10 -; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pmulhw %xmm8, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: psrlw $8, %xmm8 +; SSSE3-NEXT: packuswb %xmm1, %xmm8 +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pand %xmm9, %xmm5 ; SSSE3-NEXT: packuswb %xmm10, %xmm5 ; SSSE3-NEXT: pxor %xmm1, %xmm1 ; SSSE3-NEXT: pcmpgtb %xmm5, %xmm1 -; SSSE3-NEXT: pcmpeqb %xmm9, %xmm1 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; SSSE3-NEXT: pcmpeqb %xmm8, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] ; SSSE3-NEXT: pxor %xmm10, %xmm10 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] -; SSSE3-NEXT: pmulhw %xmm9, %xmm10 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSSE3-NEXT: pmulhw %xmm8, %xmm10 +; SSSE3-NEXT: pxor %xmm11, %xmm11 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] ; SSSE3-NEXT: movdqa %xmm10, %xmm0 ; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: pmulhw %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm9 -; SSSE3-NEXT: psrlw $8, %xmm9 -; SSSE3-NEXT: packuswb %xmm0, %xmm9 -; SSSE3-NEXT: pand %xmm8, %xmm10 -; SSSE3-NEXT: pand %xmm8, %xmm4 -; SSSE3-NEXT: packuswb %xmm10, %xmm4 -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: pcmpgtb %xmm4, %xmm8 -; SSSE3-NEXT: pcmpeqb %xmm9, %xmm8 +; SSSE3-NEXT: pmulhw %xmm11, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, %xmm11 +; SSSE3-NEXT: psrlw $8, %xmm11 +; SSSE3-NEXT: packuswb %xmm0, %xmm11 +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pand %xmm9, %xmm8 +; SSSE3-NEXT: packuswb %xmm10, %xmm8 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pcmpgtb %xmm8, %xmm4 +; SSSE3-NEXT: pcmpeqb %xmm11, %xmm4 ; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 ; SSSE3-NEXT: pxor %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm0, %xmm8 -; SSSE3-NEXT: movdqa %xmm8, %xmm0 +; SSSE3-NEXT: pxor %xmm0, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 ; SSSE3-NEXT: movdqa %xmm7, 48(%rsi) ; SSSE3-NEXT: movdqa %xmm1, %xmm7 ; SSSE3-NEXT: movdqa %xmm6, 32(%rsi) ; SSSE3-NEXT: movdqa %xmm2, %xmm6 ; SSSE3-NEXT: movdqa %xmm5, 16(%rsi) ; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, (%rsi) -; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: movdqa %xmm8, (%rsi) +; SSSE3-NEXT: movdqa %xmm3, %xmm8 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: movdqa %xmm3, 192(%rdi) @@ -2174,20 +2176,20 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: movdqa %xmm1, 64(%rdi) -; SSSE3-NEXT: movdqa %xmm8, %xmm1 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa %xmm8, (%rdi) -; SSSE3-NEXT: movdqa %xmm4, %xmm8 +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 224(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm4, (%rdi) +; SSSE3-NEXT: movdqa %xmm8, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm8 ; SSSE3-NEXT: psrad $31, %xmm8 -; SSSE3-NEXT: movdqa %xmm8, 240(%rdi) +; SSSE3-NEXT: movdqa %xmm8, 224(%rdi) +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 240(%rdi) ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] @@ -2415,96 +2417,96 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] -; AVX1-NEXT: vpmulhw %xmm6, %xmm8, %xmm8 -; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm6 +; AVX1-NEXT: vpmulhw %xmm6, %xmm8, %xmm6 +; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm8 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; AVX1-NEXT: vpmulhw %xmm4, %xmm7, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm7 -; AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm7 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm8 -; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm8, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtb %xmm4, %xmm5, %xmm8 -; AVX1-NEXT: vpcmpeqb %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX1-NEXT: vpackuswb %xmm8, %xmm7, %xmm8 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4 +; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtb %xmm4, %xmm5, %xmm6 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm8, %xmm8 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] -; AVX1-NEXT: vpmulhw %xmm8, %xmm9, %xmm8 -; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm9 +; AVX1-NEXT: vpmulhw %xmm6, %xmm9, %xmm6 +; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm9 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; AVX1-NEXT: vpmulhw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm3 ; AVX1-NEXT: vpackuswb %xmm9, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm8 -; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm8, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtb %xmm1, %xmm5, %xmm8 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm8, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm5, %xmm6 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm10 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] ; AVX1-NEXT: vpmulhw %xmm9, %xmm11, %xmm9 ; AVX1-NEXT: vpsrlw $8, %xmm9, %xmm11 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; AVX1-NEXT: vpmulhw %xmm3, %xmm10, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm10 +; AVX1-NEXT: vpmulhw %xmm6, %xmm10, %xmm6 +; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm10 ; AVX1-NEXT: vpackuswb %xmm11, %xmm10, %xmm10 -; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm9 -; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm9, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtb %xmm3, %xmm5, %xmm9 -; AVX1-NEXT: vpcmpeqb %xmm10, %xmm9, %xmm10 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] +; AVX1-NEXT: vpand %xmm7, %xmm9, %xmm9 +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpackuswb %xmm9, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpgtb %xmm6, %xmm5, %xmm9 +; AVX1-NEXT: vpcmpeqb %xmm10, %xmm9, %xmm9 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; AVX1-NEXT: vpmulhw %xmm9, %xmm11, %xmm9 -; AVX1-NEXT: vpsrlw $8, %xmm9, %xmm11 +; AVX1-NEXT: vpmulhw %xmm10, %xmm11, %xmm10 +; AVX1-NEXT: vpsrlw $8, %xmm10, %xmm11 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; AVX1-NEXT: vpmulhw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 ; AVX1-NEXT: vpackuswb %xmm11, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm9 -; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm9, %xmm0, %xmm9 -; AVX1-NEXT: vpcmpgtb %xmm9, %xmm5, %xmm0 +; AVX1-NEXT: vpand %xmm7, %xmm10, %xmm10 +; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm10, %xmm0, %xmm7 +; AVX1-NEXT: vpcmpgtb %xmm7, %xmm5, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm11, %xmm11, %xmm11 -; AVX1-NEXT: vpxor %xmm7, %xmm11, %xmm6 -; AVX1-NEXT: vpxor %xmm11, %xmm8, %xmm5 -; AVX1-NEXT: vpxor %xmm11, %xmm10, %xmm2 -; AVX1-NEXT: vpxor %xmm0, %xmm11, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm10, %xmm10, %xmm10 +; AVX1-NEXT: vpxor %xmm10, %xmm8, %xmm5 +; AVX1-NEXT: vpxor %xmm3, %xmm10, %xmm3 +; AVX1-NEXT: vpxor %xmm10, %xmm9, %xmm2 +; AVX1-NEXT: vpxor %xmm0, %xmm10, %xmm0 ; AVX1-NEXT: vmovdqa %xmm4, 48(%rsi) ; AVX1-NEXT: vmovdqa %xmm1, 32(%rsi) -; AVX1-NEXT: vmovdqa %xmm3, 16(%rsi) -; AVX1-NEXT: vmovdqa %xmm9, (%rsi) -; AVX1-NEXT: vpmovsxbd %xmm6, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, 192(%rdi) +; AVX1-NEXT: vmovdqa %xmm6, 16(%rsi) +; AVX1-NEXT: vmovdqa %xmm7, (%rsi) ; AVX1-NEXT: vpmovsxbd %xmm5, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, 192(%rdi) +; AVX1-NEXT: vpmovsxbd %xmm3, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 128(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 64(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 224(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 240(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 208(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 160(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 176(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 144(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] @@ -3302,18 +3304,18 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: movq %rdx, %r8 ; SSE2-NEXT: movq %rsi, %r11 ; SSE2-NEXT: movq %rdi, %r10 -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; SSE2-NEXT: movq %r11, %rdx +; SSE2-NEXT: movq %rsi, %rdx ; SSE2-NEXT: sarq $63, %rdx ; SSE2-NEXT: movq %r9, %rbx ; SSE2-NEXT: imulq %rdx, %rbx ; SSE2-NEXT: movq %r15, %rax ; SSE2-NEXT: mulq %rdx -; SSE2-NEXT: movq %rdx, %rdi +; SSE2-NEXT: movq %rdx, %rsi ; SSE2-NEXT: movq %rax, %r12 -; SSE2-NEXT: addq %rax, %rdi -; SSE2-NEXT: addq %rbx, %rdi +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: addq %rbx, %rsi ; SSE2-NEXT: movq %r9, %rax ; SSE2-NEXT: sarq $63, %rax ; SSE2-NEXT: movq %rax, %r13 @@ -3324,11 +3326,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: addq %r13, %rbx ; SSE2-NEXT: addq %rax, %rbx ; SSE2-NEXT: addq %r12, %r14 -; SSE2-NEXT: adcq %rdi, %rbx +; SSE2-NEXT: adcq %rsi, %rbx ; SSE2-NEXT: movq %r10, %rax ; SSE2-NEXT: mulq %r15 ; SSE2-NEXT: movq %rdx, %r12 -; SSE2-NEXT: movq %rax, %rdi +; SSE2-NEXT: movq %rax, %rsi ; SSE2-NEXT: movq %r11, %rax ; SSE2-NEXT: mulq %r15 ; SSE2-NEXT: movq %rdx, %r15 @@ -3359,63 +3361,63 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: setne %r15b ; SSE2-NEXT: movq %rcx, %rdx ; SSE2-NEXT: sarq $63, %rdx -; SSE2-NEXT: movq %rbp, %r11 -; SSE2-NEXT: imulq %rdx, %r11 -; SSE2-NEXT: movq %rsi, %rax +; SSE2-NEXT: movq %rbp, %r10 +; SSE2-NEXT: imulq %rdx, %r10 +; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: mulq %rdx ; SSE2-NEXT: movq %rdx, %r9 -; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: movq %rax, %rbx ; SSE2-NEXT: addq %rax, %r9 -; SSE2-NEXT: addq %r11, %r9 +; SSE2-NEXT: addq %r10, %r9 ; SSE2-NEXT: movq %rbp, %rax ; SSE2-NEXT: sarq $63, %rax ; SSE2-NEXT: movq %rax, %r14 ; SSE2-NEXT: imulq %rcx, %r14 ; SSE2-NEXT: mulq %r8 ; SSE2-NEXT: movq %rax, %r11 -; SSE2-NEXT: movq %rdx, %rbx -; SSE2-NEXT: addq %r14, %rbx -; SSE2-NEXT: addq %rax, %rbx -; SSE2-NEXT: addq %r10, %r11 -; SSE2-NEXT: adcq %r9, %rbx +; SSE2-NEXT: movq %rdx, %r10 +; SSE2-NEXT: addq %r14, %r10 +; SSE2-NEXT: addq %rax, %r10 +; SSE2-NEXT: addq %rbx, %r11 +; SSE2-NEXT: adcq %r9, %r10 ; SSE2-NEXT: movq %r8, %rax -; SSE2-NEXT: mulq %rsi -; SSE2-NEXT: movq %rdx, %r9 -; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: mulq %rdi +; SSE2-NEXT: movq %rdx, %rbx +; SSE2-NEXT: movq %rax, %r9 ; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: mulq %rsi -; SSE2-NEXT: movq %rdx, %rsi +; SSE2-NEXT: mulq %rdi +; SSE2-NEXT: movq %rdx, %rdi ; SSE2-NEXT: movq %rax, %r14 -; SSE2-NEXT: addq %r9, %r14 -; SSE2-NEXT: adcq $0, %rsi +; SSE2-NEXT: addq %rbx, %r14 +; SSE2-NEXT: adcq $0, %rdi ; SSE2-NEXT: movq %r8, %rax ; SSE2-NEXT: mulq %rbp ; SSE2-NEXT: movq %rdx, %r8 -; SSE2-NEXT: movq %rax, %r9 -; SSE2-NEXT: addq %r14, %r9 -; SSE2-NEXT: adcq %rsi, %r8 +; SSE2-NEXT: movq %rax, %rbx +; SSE2-NEXT: addq %r14, %rbx +; SSE2-NEXT: adcq %rdi, %r8 ; SSE2-NEXT: setb %al -; SSE2-NEXT: movzbl %al, %esi +; SSE2-NEXT: movzbl %al, %edi ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: mulq %rbp ; SSE2-NEXT: addq %r8, %rax -; SSE2-NEXT: adcq %rsi, %rdx +; SSE2-NEXT: adcq %rdi, %rdx ; SSE2-NEXT: addq %r11, %rax -; SSE2-NEXT: adcq %rbx, %rdx -; SSE2-NEXT: movq %r9, 24(%r12) -; SSE2-NEXT: sarq $63, %r9 -; SSE2-NEXT: xorq %r9, %rdx -; SSE2-NEXT: xorq %rax, %r9 +; SSE2-NEXT: adcq %r10, %rdx +; SSE2-NEXT: movq %rbx, 24(%r12) +; SSE2-NEXT: sarq $63, %rbx +; SSE2-NEXT: xorq %rbx, %rdx +; SSE2-NEXT: xorq %rax, %rbx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rdx, %r9 +; SSE2-NEXT: orq %rdx, %rbx ; SSE2-NEXT: setne %al ; SSE2-NEXT: negl %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: negl %r15d ; SSE2-NEXT: movd %r15d, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq %r10, 16(%r12) -; SSE2-NEXT: movq %rdi, (%r12) +; SSE2-NEXT: movq %r9, 16(%r12) +; SSE2-NEXT: movq %rsi, (%r12) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -3436,18 +3438,18 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: movq %rdx, %r8 ; SSSE3-NEXT: movq %rsi, %r11 ; SSSE3-NEXT: movq %rdi, %r10 -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; SSSE3-NEXT: movq %r11, %rdx +; SSSE3-NEXT: movq %rsi, %rdx ; SSSE3-NEXT: sarq $63, %rdx ; SSSE3-NEXT: movq %r9, %rbx ; SSSE3-NEXT: imulq %rdx, %rbx ; SSSE3-NEXT: movq %r15, %rax ; SSSE3-NEXT: mulq %rdx -; SSSE3-NEXT: movq %rdx, %rdi +; SSSE3-NEXT: movq %rdx, %rsi ; SSSE3-NEXT: movq %rax, %r12 -; SSSE3-NEXT: addq %rax, %rdi -; SSSE3-NEXT: addq %rbx, %rdi +; SSSE3-NEXT: addq %rax, %rsi +; SSSE3-NEXT: addq %rbx, %rsi ; SSSE3-NEXT: movq %r9, %rax ; SSSE3-NEXT: sarq $63, %rax ; SSSE3-NEXT: movq %rax, %r13 @@ -3458,11 +3460,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: addq %r13, %rbx ; SSSE3-NEXT: addq %rax, %rbx ; SSSE3-NEXT: addq %r12, %r14 -; SSSE3-NEXT: adcq %rdi, %rbx +; SSSE3-NEXT: adcq %rsi, %rbx ; SSSE3-NEXT: movq %r10, %rax ; SSSE3-NEXT: mulq %r15 ; SSSE3-NEXT: movq %rdx, %r12 -; SSSE3-NEXT: movq %rax, %rdi +; SSSE3-NEXT: movq %rax, %rsi ; SSSE3-NEXT: movq %r11, %rax ; SSSE3-NEXT: mulq %r15 ; SSSE3-NEXT: movq %rdx, %r15 @@ -3493,63 +3495,63 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: setne %r15b ; SSSE3-NEXT: movq %rcx, %rdx ; SSSE3-NEXT: sarq $63, %rdx -; SSSE3-NEXT: movq %rbp, %r11 -; SSSE3-NEXT: imulq %rdx, %r11 -; SSSE3-NEXT: movq %rsi, %rax +; SSSE3-NEXT: movq %rbp, %r10 +; SSSE3-NEXT: imulq %rdx, %r10 +; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: mulq %rdx ; SSSE3-NEXT: movq %rdx, %r9 -; SSSE3-NEXT: movq %rax, %r10 +; SSSE3-NEXT: movq %rax, %rbx ; SSSE3-NEXT: addq %rax, %r9 -; SSSE3-NEXT: addq %r11, %r9 +; SSSE3-NEXT: addq %r10, %r9 ; SSSE3-NEXT: movq %rbp, %rax ; SSSE3-NEXT: sarq $63, %rax ; SSSE3-NEXT: movq %rax, %r14 ; SSSE3-NEXT: imulq %rcx, %r14 ; SSSE3-NEXT: mulq %r8 ; SSSE3-NEXT: movq %rax, %r11 -; SSSE3-NEXT: movq %rdx, %rbx -; SSSE3-NEXT: addq %r14, %rbx -; SSSE3-NEXT: addq %rax, %rbx -; SSSE3-NEXT: addq %r10, %r11 -; SSSE3-NEXT: adcq %r9, %rbx +; SSSE3-NEXT: movq %rdx, %r10 +; SSSE3-NEXT: addq %r14, %r10 +; SSSE3-NEXT: addq %rax, %r10 +; SSSE3-NEXT: addq %rbx, %r11 +; SSSE3-NEXT: adcq %r9, %r10 ; SSSE3-NEXT: movq %r8, %rax -; SSSE3-NEXT: mulq %rsi -; SSSE3-NEXT: movq %rdx, %r9 -; SSSE3-NEXT: movq %rax, %r10 +; SSSE3-NEXT: mulq %rdi +; SSSE3-NEXT: movq %rdx, %rbx +; SSSE3-NEXT: movq %rax, %r9 ; SSSE3-NEXT: movq %rcx, %rax -; SSSE3-NEXT: mulq %rsi -; SSSE3-NEXT: movq %rdx, %rsi +; SSSE3-NEXT: mulq %rdi +; SSSE3-NEXT: movq %rdx, %rdi ; SSSE3-NEXT: movq %rax, %r14 -; SSSE3-NEXT: addq %r9, %r14 -; SSSE3-NEXT: adcq $0, %rsi +; SSSE3-NEXT: addq %rbx, %r14 +; SSSE3-NEXT: adcq $0, %rdi ; SSSE3-NEXT: movq %r8, %rax ; SSSE3-NEXT: mulq %rbp ; SSSE3-NEXT: movq %rdx, %r8 -; SSSE3-NEXT: movq %rax, %r9 -; SSSE3-NEXT: addq %r14, %r9 -; SSSE3-NEXT: adcq %rsi, %r8 +; SSSE3-NEXT: movq %rax, %rbx +; SSSE3-NEXT: addq %r14, %rbx +; SSSE3-NEXT: adcq %rdi, %r8 ; SSSE3-NEXT: setb %al -; SSSE3-NEXT: movzbl %al, %esi +; SSSE3-NEXT: movzbl %al, %edi ; SSSE3-NEXT: movq %rcx, %rax ; SSSE3-NEXT: mulq %rbp ; SSSE3-NEXT: addq %r8, %rax -; SSSE3-NEXT: adcq %rsi, %rdx +; SSSE3-NEXT: adcq %rdi, %rdx ; SSSE3-NEXT: addq %r11, %rax -; SSSE3-NEXT: adcq %rbx, %rdx -; SSSE3-NEXT: movq %r9, 24(%r12) -; SSSE3-NEXT: sarq $63, %r9 -; SSSE3-NEXT: xorq %r9, %rdx -; SSSE3-NEXT: xorq %rax, %r9 +; SSSE3-NEXT: adcq %r10, %rdx +; SSSE3-NEXT: movq %rbx, 24(%r12) +; SSSE3-NEXT: sarq $63, %rbx +; SSSE3-NEXT: xorq %rbx, %rdx +; SSSE3-NEXT: xorq %rax, %rbx ; SSSE3-NEXT: xorl %eax, %eax -; SSSE3-NEXT: orq %rdx, %r9 +; SSSE3-NEXT: orq %rdx, %rbx ; SSSE3-NEXT: setne %al ; SSSE3-NEXT: negl %eax ; SSSE3-NEXT: movd %eax, %xmm1 ; SSSE3-NEXT: negl %r15d ; SSSE3-NEXT: movd %r15d, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %r10, 16(%r12) -; SSSE3-NEXT: movq %rdi, (%r12) +; SSSE3-NEXT: movq %r9, 16(%r12) +; SSSE3-NEXT: movq %rsi, (%r12) ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 ; SSSE3-NEXT: popq %r13 @@ -3570,18 +3572,18 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: movq %rdx, %r8 ; SSE41-NEXT: movq %rsi, %r11 ; SSE41-NEXT: movq %rdi, %r10 -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; SSE41-NEXT: movq %r11, %rdx +; SSE41-NEXT: movq %rsi, %rdx ; SSE41-NEXT: sarq $63, %rdx ; SSE41-NEXT: movq %r9, %rbx ; SSE41-NEXT: imulq %rdx, %rbx ; SSE41-NEXT: movq %r15, %rax ; SSE41-NEXT: mulq %rdx -; SSE41-NEXT: movq %rdx, %rdi +; SSE41-NEXT: movq %rdx, %rsi ; SSE41-NEXT: movq %rax, %r12 -; SSE41-NEXT: addq %rax, %rdi -; SSE41-NEXT: addq %rbx, %rdi +; SSE41-NEXT: addq %rax, %rsi +; SSE41-NEXT: addq %rbx, %rsi ; SSE41-NEXT: movq %r9, %rax ; SSE41-NEXT: sarq $63, %rax ; SSE41-NEXT: movq %rax, %r13 @@ -3592,11 +3594,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: addq %r13, %rbx ; SSE41-NEXT: addq %rax, %rbx ; SSE41-NEXT: addq %r12, %r14 -; SSE41-NEXT: adcq %rdi, %rbx +; SSE41-NEXT: adcq %rsi, %rbx ; SSE41-NEXT: movq %r10, %rax ; SSE41-NEXT: mulq %r15 ; SSE41-NEXT: movq %rdx, %r12 -; SSE41-NEXT: movq %rax, %rdi +; SSE41-NEXT: movq %rax, %rsi ; SSE41-NEXT: movq %r11, %rax ; SSE41-NEXT: mulq %r15 ; SSE41-NEXT: movq %rdx, %r15 @@ -3627,62 +3629,62 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: setne %r15b ; SSE41-NEXT: movq %rcx, %rdx ; SSE41-NEXT: sarq $63, %rdx -; SSE41-NEXT: movq %rbp, %r11 -; SSE41-NEXT: imulq %rdx, %r11 -; SSE41-NEXT: movq %rsi, %rax +; SSE41-NEXT: movq %rbp, %r10 +; SSE41-NEXT: imulq %rdx, %r10 +; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: mulq %rdx ; SSE41-NEXT: movq %rdx, %r9 -; SSE41-NEXT: movq %rax, %r10 +; SSE41-NEXT: movq %rax, %rbx ; SSE41-NEXT: addq %rax, %r9 -; SSE41-NEXT: addq %r11, %r9 +; SSE41-NEXT: addq %r10, %r9 ; SSE41-NEXT: movq %rbp, %rax ; SSE41-NEXT: sarq $63, %rax ; SSE41-NEXT: movq %rax, %r14 ; SSE41-NEXT: imulq %rcx, %r14 ; SSE41-NEXT: mulq %r8 ; SSE41-NEXT: movq %rax, %r11 -; SSE41-NEXT: movq %rdx, %rbx -; SSE41-NEXT: addq %r14, %rbx -; SSE41-NEXT: addq %rax, %rbx -; SSE41-NEXT: addq %r10, %r11 -; SSE41-NEXT: adcq %r9, %rbx +; SSE41-NEXT: movq %rdx, %r10 +; SSE41-NEXT: addq %r14, %r10 +; SSE41-NEXT: addq %rax, %r10 +; SSE41-NEXT: addq %rbx, %r11 +; SSE41-NEXT: adcq %r9, %r10 ; SSE41-NEXT: movq %r8, %rax -; SSE41-NEXT: mulq %rsi -; SSE41-NEXT: movq %rdx, %r9 -; SSE41-NEXT: movq %rax, %r10 +; SSE41-NEXT: mulq %rdi +; SSE41-NEXT: movq %rdx, %rbx +; SSE41-NEXT: movq %rax, %r9 ; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: mulq %rsi -; SSE41-NEXT: movq %rdx, %rsi +; SSE41-NEXT: mulq %rdi +; SSE41-NEXT: movq %rdx, %rdi ; SSE41-NEXT: movq %rax, %r14 -; SSE41-NEXT: addq %r9, %r14 -; SSE41-NEXT: adcq $0, %rsi +; SSE41-NEXT: addq %rbx, %r14 +; SSE41-NEXT: adcq $0, %rdi ; SSE41-NEXT: movq %r8, %rax ; SSE41-NEXT: mulq %rbp ; SSE41-NEXT: movq %rdx, %r8 -; SSE41-NEXT: movq %rax, %r9 -; SSE41-NEXT: addq %r14, %r9 -; SSE41-NEXT: adcq %rsi, %r8 +; SSE41-NEXT: movq %rax, %rbx +; SSE41-NEXT: addq %r14, %rbx +; SSE41-NEXT: adcq %rdi, %r8 ; SSE41-NEXT: setb %al -; SSE41-NEXT: movzbl %al, %esi +; SSE41-NEXT: movzbl %al, %edi ; SSE41-NEXT: movq %rcx, %rax ; SSE41-NEXT: mulq %rbp ; SSE41-NEXT: addq %r8, %rax -; SSE41-NEXT: adcq %rsi, %rdx +; SSE41-NEXT: adcq %rdi, %rdx ; SSE41-NEXT: addq %r11, %rax -; SSE41-NEXT: adcq %rbx, %rdx -; SSE41-NEXT: movq %r9, 24(%r12) -; SSE41-NEXT: sarq $63, %r9 -; SSE41-NEXT: xorq %r9, %rdx -; SSE41-NEXT: xorq %rax, %r9 +; SSE41-NEXT: adcq %r10, %rdx +; SSE41-NEXT: movq %rbx, 24(%r12) +; SSE41-NEXT: sarq $63, %rbx +; SSE41-NEXT: xorq %rbx, %rdx +; SSE41-NEXT: xorq %rax, %rbx ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: orq %rdx, %r9 +; SSE41-NEXT: orq %rdx, %rbx ; SSE41-NEXT: setne %al ; SSE41-NEXT: negl %eax ; SSE41-NEXT: negl %r15d ; SSE41-NEXT: movd %r15d, %xmm0 ; SSE41-NEXT: pinsrd $1, %eax, %xmm0 -; SSE41-NEXT: movq %r10, 16(%r12) -; SSE41-NEXT: movq %rdi, (%r12) +; SSE41-NEXT: movq %r9, 16(%r12) +; SSE41-NEXT: movq %rsi, (%r12) ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 ; SSE41-NEXT: popq %r13 @@ -3703,18 +3705,18 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: movq %rdx, %r8 ; AVX-NEXT: movq %rsi, %r11 ; AVX-NEXT: movq %rdi, %r10 -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; AVX-NEXT: movq %r11, %rdx +; AVX-NEXT: movq %rsi, %rdx ; AVX-NEXT: sarq $63, %rdx ; AVX-NEXT: movq %r9, %rbx ; AVX-NEXT: imulq %rdx, %rbx ; AVX-NEXT: movq %r15, %rax ; AVX-NEXT: mulq %rdx -; AVX-NEXT: movq %rdx, %rdi +; AVX-NEXT: movq %rdx, %rsi ; AVX-NEXT: movq %rax, %r12 -; AVX-NEXT: addq %rax, %rdi -; AVX-NEXT: addq %rbx, %rdi +; AVX-NEXT: addq %rax, %rsi +; AVX-NEXT: addq %rbx, %rsi ; AVX-NEXT: movq %r9, %rax ; AVX-NEXT: sarq $63, %rax ; AVX-NEXT: movq %rax, %r13 @@ -3725,11 +3727,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: addq %r13, %rbx ; AVX-NEXT: addq %rax, %rbx ; AVX-NEXT: addq %r12, %r14 -; AVX-NEXT: adcq %rdi, %rbx +; AVX-NEXT: adcq %rsi, %rbx ; AVX-NEXT: movq %r10, %rax ; AVX-NEXT: mulq %r15 ; AVX-NEXT: movq %rdx, %r12 -; AVX-NEXT: movq %rax, %rdi +; AVX-NEXT: movq %rax, %rsi ; AVX-NEXT: movq %r11, %rax ; AVX-NEXT: mulq %r15 ; AVX-NEXT: movq %rdx, %r15 @@ -3760,62 +3762,62 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: setne %r15b ; AVX-NEXT: movq %rcx, %rdx ; AVX-NEXT: sarq $63, %rdx -; AVX-NEXT: movq %rbp, %r11 -; AVX-NEXT: imulq %rdx, %r11 -; AVX-NEXT: movq %rsi, %rax +; AVX-NEXT: movq %rbp, %r10 +; AVX-NEXT: imulq %rdx, %r10 +; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: mulq %rdx ; AVX-NEXT: movq %rdx, %r9 -; AVX-NEXT: movq %rax, %r10 +; AVX-NEXT: movq %rax, %rbx ; AVX-NEXT: addq %rax, %r9 -; AVX-NEXT: addq %r11, %r9 +; AVX-NEXT: addq %r10, %r9 ; AVX-NEXT: movq %rbp, %rax ; AVX-NEXT: sarq $63, %rax ; AVX-NEXT: movq %rax, %r14 ; AVX-NEXT: imulq %rcx, %r14 ; AVX-NEXT: mulq %r8 ; AVX-NEXT: movq %rax, %r11 -; AVX-NEXT: movq %rdx, %rbx -; AVX-NEXT: addq %r14, %rbx -; AVX-NEXT: addq %rax, %rbx -; AVX-NEXT: addq %r10, %r11 -; AVX-NEXT: adcq %r9, %rbx +; AVX-NEXT: movq %rdx, %r10 +; AVX-NEXT: addq %r14, %r10 +; AVX-NEXT: addq %rax, %r10 +; AVX-NEXT: addq %rbx, %r11 +; AVX-NEXT: adcq %r9, %r10 ; AVX-NEXT: movq %r8, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rdx, %r9 -; AVX-NEXT: movq %rax, %r10 +; AVX-NEXT: mulq %rdi +; AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: movq %rax, %r9 ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rdx, %rsi +; AVX-NEXT: mulq %rdi +; AVX-NEXT: movq %rdx, %rdi ; AVX-NEXT: movq %rax, %r14 -; AVX-NEXT: addq %r9, %r14 -; AVX-NEXT: adcq $0, %rsi +; AVX-NEXT: addq %rbx, %r14 +; AVX-NEXT: adcq $0, %rdi ; AVX-NEXT: movq %r8, %rax ; AVX-NEXT: mulq %rbp ; AVX-NEXT: movq %rdx, %r8 -; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: addq %r14, %r9 -; AVX-NEXT: adcq %rsi, %r8 +; AVX-NEXT: movq %rax, %rbx +; AVX-NEXT: addq %r14, %rbx +; AVX-NEXT: adcq %rdi, %r8 ; AVX-NEXT: setb %al -; AVX-NEXT: movzbl %al, %esi +; AVX-NEXT: movzbl %al, %edi ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: mulq %rbp ; AVX-NEXT: addq %r8, %rax -; AVX-NEXT: adcq %rsi, %rdx +; AVX-NEXT: adcq %rdi, %rdx ; AVX-NEXT: addq %r11, %rax -; AVX-NEXT: adcq %rbx, %rdx -; AVX-NEXT: movq %r9, 24(%r12) -; AVX-NEXT: sarq $63, %r9 -; AVX-NEXT: xorq %r9, %rdx -; AVX-NEXT: xorq %rax, %r9 +; AVX-NEXT: adcq %r10, %rdx +; AVX-NEXT: movq %rbx, 24(%r12) +; AVX-NEXT: sarq $63, %rbx +; AVX-NEXT: xorq %rbx, %rdx +; AVX-NEXT: xorq %rax, %rbx ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: orq %rdx, %r9 +; AVX-NEXT: orq %rdx, %rbx ; AVX-NEXT: setne %al ; AVX-NEXT: negl %eax ; AVX-NEXT: negl %r15d ; AVX-NEXT: vmovd %r15d, %xmm0 ; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: movq %r10, 16(%r12) -; AVX-NEXT: movq %rdi, (%r12) +; AVX-NEXT: movq %r9, 16(%r12) +; AVX-NEXT: movq %rsi, (%r12) ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r12 ; AVX-NEXT: popq %r13 @@ -3897,7 +3899,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512F-NEXT: movq %r8, %rax ; AVX512F-NEXT: mulq %rdx ; AVX512F-NEXT: movq %rdx, %r10 -; AVX512F-NEXT: movq %rax, %r11 +; AVX512F-NEXT: movq %rax, %r14 ; AVX512F-NEXT: addq %rax, %r10 ; AVX512F-NEXT: addq %rsi, %r10 ; AVX512F-NEXT: movq %rbp, %rax @@ -3906,26 +3908,26 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512F-NEXT: imulq %r9, %rsi ; AVX512F-NEXT: mulq %rdi ; AVX512F-NEXT: movq %rax, %rbx -; AVX512F-NEXT: movq %rdx, %r14 -; AVX512F-NEXT: addq %rsi, %r14 -; AVX512F-NEXT: addq %rax, %r14 -; AVX512F-NEXT: addq %r11, %rbx -; AVX512F-NEXT: adcq %r10, %r14 +; AVX512F-NEXT: movq %rdx, %r11 +; AVX512F-NEXT: addq %rsi, %r11 +; AVX512F-NEXT: addq %rax, %r11 +; AVX512F-NEXT: addq %r14, %rbx +; AVX512F-NEXT: adcq %r10, %r11 ; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: mulq %r8 -; AVX512F-NEXT: movq %rdx, %r10 -; AVX512F-NEXT: movq %rax, %r11 +; AVX512F-NEXT: movq %rdx, %r14 +; AVX512F-NEXT: movq %rax, %r10 ; AVX512F-NEXT: movq %r9, %rax ; AVX512F-NEXT: mulq %r8 ; AVX512F-NEXT: movq %rdx, %r8 ; AVX512F-NEXT: movq %rax, %r15 -; AVX512F-NEXT: addq %r10, %r15 +; AVX512F-NEXT: addq %r14, %r15 ; AVX512F-NEXT: adcq $0, %r8 ; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: mulq %rbp ; AVX512F-NEXT: movq %rdx, %rdi -; AVX512F-NEXT: movq %rax, %r10 -; AVX512F-NEXT: addq %r15, %r10 +; AVX512F-NEXT: movq %rax, %r14 +; AVX512F-NEXT: addq %r15, %r14 ; AVX512F-NEXT: adcq %r8, %rdi ; AVX512F-NEXT: setb %al ; AVX512F-NEXT: movzbl %al, %esi @@ -3934,12 +3936,12 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512F-NEXT: addq %rdi, %rax ; AVX512F-NEXT: adcq %rsi, %rdx ; AVX512F-NEXT: addq %rbx, %rax -; AVX512F-NEXT: adcq %r14, %rdx -; AVX512F-NEXT: movq %r10, 8(%r12) -; AVX512F-NEXT: sarq $63, %r10 -; AVX512F-NEXT: xorq %r10, %rdx -; AVX512F-NEXT: xorq %rax, %r10 -; AVX512F-NEXT: orq %rdx, %r10 +; AVX512F-NEXT: adcq %r11, %rdx +; AVX512F-NEXT: movq %r14, 8(%r12) +; AVX512F-NEXT: sarq $63, %r14 +; AVX512F-NEXT: xorq %r14, %rdx +; AVX512F-NEXT: xorq %rax, %r14 +; AVX512F-NEXT: orq %rdx, %r14 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: kmovw %eax, %k1 @@ -3948,7 +3950,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512F-NEXT: movq %rcx, 16(%r12) -; AVX512F-NEXT: movq %r11, (%r12) +; AVX512F-NEXT: movq %r10, (%r12) ; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: popq %r12 ; AVX512F-NEXT: popq %r13 @@ -4030,7 +4032,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512BW-NEXT: movq %r8, %rax ; AVX512BW-NEXT: mulq %rdx ; AVX512BW-NEXT: movq %rdx, %r10 -; AVX512BW-NEXT: movq %rax, %r11 +; AVX512BW-NEXT: movq %rax, %r14 ; AVX512BW-NEXT: addq %rax, %r10 ; AVX512BW-NEXT: addq %rsi, %r10 ; AVX512BW-NEXT: movq %rbp, %rax @@ -4039,26 +4041,26 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512BW-NEXT: imulq %r9, %rsi ; AVX512BW-NEXT: mulq %rdi ; AVX512BW-NEXT: movq %rax, %rbx -; AVX512BW-NEXT: movq %rdx, %r14 -; AVX512BW-NEXT: addq %rsi, %r14 -; AVX512BW-NEXT: addq %rax, %r14 -; AVX512BW-NEXT: addq %r11, %rbx -; AVX512BW-NEXT: adcq %r10, %r14 +; AVX512BW-NEXT: movq %rdx, %r11 +; AVX512BW-NEXT: addq %rsi, %r11 +; AVX512BW-NEXT: addq %rax, %r11 +; AVX512BW-NEXT: addq %r14, %rbx +; AVX512BW-NEXT: adcq %r10, %r11 ; AVX512BW-NEXT: movq %rdi, %rax ; AVX512BW-NEXT: mulq %r8 -; AVX512BW-NEXT: movq %rdx, %r10 -; AVX512BW-NEXT: movq %rax, %r11 +; AVX512BW-NEXT: movq %rdx, %r14 +; AVX512BW-NEXT: movq %rax, %r10 ; AVX512BW-NEXT: movq %r9, %rax ; AVX512BW-NEXT: mulq %r8 ; AVX512BW-NEXT: movq %rdx, %r8 ; AVX512BW-NEXT: movq %rax, %r15 -; AVX512BW-NEXT: addq %r10, %r15 +; AVX512BW-NEXT: addq %r14, %r15 ; AVX512BW-NEXT: adcq $0, %r8 ; AVX512BW-NEXT: movq %rdi, %rax ; AVX512BW-NEXT: mulq %rbp ; AVX512BW-NEXT: movq %rdx, %rdi -; AVX512BW-NEXT: movq %rax, %r10 -; AVX512BW-NEXT: addq %r15, %r10 +; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: addq %r15, %r14 ; AVX512BW-NEXT: adcq %r8, %rdi ; AVX512BW-NEXT: setb %al ; AVX512BW-NEXT: movzbl %al, %esi @@ -4067,12 +4069,12 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512BW-NEXT: addq %rdi, %rax ; AVX512BW-NEXT: adcq %rsi, %rdx ; AVX512BW-NEXT: addq %rbx, %rax -; AVX512BW-NEXT: adcq %r14, %rdx -; AVX512BW-NEXT: movq %r10, 8(%r12) -; AVX512BW-NEXT: sarq $63, %r10 -; AVX512BW-NEXT: xorq %r10, %rdx -; AVX512BW-NEXT: xorq %rax, %r10 -; AVX512BW-NEXT: orq %rdx, %r10 +; AVX512BW-NEXT: adcq %r11, %rdx +; AVX512BW-NEXT: movq %r14, 8(%r12) +; AVX512BW-NEXT: sarq $63, %r14 +; AVX512BW-NEXT: xorq %r14, %rdx +; AVX512BW-NEXT: xorq %rax, %r14 +; AVX512BW-NEXT: orq %rdx, %r14 ; AVX512BW-NEXT: setne %al ; AVX512BW-NEXT: andl $1, %eax ; AVX512BW-NEXT: kmovw %eax, %k1 @@ -4081,7 +4083,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512BW-NEXT: movq %rcx, 16(%r12) -; AVX512BW-NEXT: movq %r11, (%r12) +; AVX512BW-NEXT: movq %r10, (%r12) ; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: popq %r12 ; AVX512BW-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll index 657e975a69440..653c3a9969151 100644 --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -222,19 +222,19 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movd %r8d, %xmm0 ; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE2-NEXT: movd %edx, %xmm3 -; SSE2-NEXT: movd %esi, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -242,38 +242,38 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, (%rcx) -; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE2-NEXT: paddd %xmm2, %xmm3 ; SSE2-NEXT: movq %xmm3, 16(%rcx) ; SSE2-NEXT: pxor %xmm4, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: movq %xmm2, 16(%rdi) -; SSE2-NEXT: movdqa %xmm1, (%rdi) +; SSE2-NEXT: movdqa %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: uaddo_v6i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: movd %r8d, %xmm1 +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: movd %r8d, %xmm0 ; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSSE3-NEXT: movd %edx, %xmm3 -; SSSE3-NEXT: movd %esi, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -281,19 +281,19 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: paddd %xmm1, %xmm0 +; SSSE3-NEXT: paddd %xmm0, %xmm1 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, (%rcx) -; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, (%rcx) ; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 ; SSSE3-NEXT: paddd %xmm2, %xmm3 ; SSSE3-NEXT: movq %xmm3, 16(%rcx) ; SSSE3-NEXT: pxor %xmm4, %xmm3 ; SSSE3-NEXT: pxor %xmm4, %xmm2 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 ; SSSE3-NEXT: movq %xmm2, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm1, (%rdi) +; SSSE3-NEXT: movdqa %xmm0, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: uaddo_v6i32: diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index b919bdb40c5f1..e929499c92cbd 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -1621,85 +1621,85 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-LABEL: umulo_v64i8: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pxor %xmm10, %xmm10 ; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; SSE2-NEXT: pmullw %xmm8, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; SSE2-NEXT: pmullw %xmm8, %xmm11 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa %xmm10, %xmm11 -; SSE2-NEXT: pand %xmm8, %xmm11 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm11, %xmm9 +; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSE2-NEXT: pmullw %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: packuswb %xmm11, %xmm4 -; SSE2-NEXT: movdqa %xmm5, %xmm12 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; SSE2-NEXT: movdqa %xmm1, %xmm11 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; SSE2-NEXT: pmullw %xmm12, %xmm11 -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pand %xmm8, %xmm12 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE2-NEXT: packuswb %xmm9, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; SSE2-NEXT: movdqa %xmm1, %xmm12 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; SSE2-NEXT: pmullw %xmm9, %xmm12 +; SSE2-NEXT: movdqa %xmm12, %xmm9 +; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE2-NEXT: pmullw %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: packuswb %xmm12, %xmm5 -; SSE2-NEXT: movdqa %xmm6, %xmm13 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; SSE2-NEXT: movdqa %xmm2, %xmm12 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; SSE2-NEXT: pmullw %xmm13, %xmm12 -; SSE2-NEXT: movdqa %xmm12, %xmm13 -; SSE2-NEXT: pand %xmm8, %xmm13 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE2-NEXT: packuswb %xmm9, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; SSE2-NEXT: movdqa %xmm2, %xmm13 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] +; SSE2-NEXT: pmullw %xmm9, %xmm13 +; SSE2-NEXT: movdqa %xmm13, %xmm14 +; SSE2-NEXT: pand %xmm8, %xmm14 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; SSE2-NEXT: pmullw %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: packuswb %xmm13, %xmm6 -; SSE2-NEXT: movdqa %xmm7, %xmm13 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: packuswb %xmm14, %xmm9 +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] ; SSE2-NEXT: movdqa %xmm3, %xmm14 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] -; SSE2-NEXT: pmullw %xmm13, %xmm14 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] +; SSE2-NEXT: pmullw %xmm6, %xmm14 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSE2-NEXT: pmullw %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: movdqa %xmm14, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm6 ; SSE2-NEXT: pand %xmm3, %xmm8 -; SSE2-NEXT: packuswb %xmm7, %xmm8 +; SSE2-NEXT: packuswb %xmm6, %xmm8 ; SSE2-NEXT: psrlw $8, %xmm14 ; SSE2-NEXT: psrlw $8, %xmm3 ; SSE2-NEXT: packuswb %xmm14, %xmm3 -; SSE2-NEXT: psrlw $8, %xmm12 +; SSE2-NEXT: psrlw $8, %xmm13 ; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: packuswb %xmm12, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm11 +; SSE2-NEXT: packuswb %xmm13, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm12 ; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm11, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm10 +; SSE2-NEXT: packuswb %xmm12, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm11 ; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm10, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm9, %xmm3 -; SSE2-NEXT: pcmpeqb %xmm9, %xmm2 -; SSE2-NEXT: pcmpeqb %xmm9, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm9, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 -; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: pxor %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm7, %xmm1 -; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: packuswb %xmm11, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm10, %xmm3 +; SSE2-NEXT: pcmpeqb %xmm10, %xmm2 +; SSE2-NEXT: pcmpeqb %xmm10, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm10, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE2-NEXT: pxor %xmm6, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm6, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: movdqa %xmm8, 48(%rsi) -; SSE2-NEXT: movdqa %xmm1, %xmm8 -; SSE2-NEXT: movdqa %xmm6, 32(%rsi) -; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: movdqa %xmm9, 32(%rsi) +; SSE2-NEXT: movdqa %xmm2, %xmm8 ; SSE2-NEXT: movdqa %xmm5, 16(%rsi) ; SSE2-NEXT: movdqa %xmm3, %xmm5 ; SSE2-NEXT: movdqa %xmm4, (%rsi) @@ -1745,11 +1745,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa %xmm0, 176(%rdi) ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: movdqa %xmm6, 144(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm8 +; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: movdqa %xmm8, 144(%rdi) ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm2 @@ -1760,11 +1760,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa %xmm0, 112(%rdi) ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm8 -; SSE2-NEXT: psrad $31, %xmm8 -; SSE2-NEXT: movdqa %xmm8, 80(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: movdqa %xmm7, 80(%rdi) ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm1 @@ -1774,95 +1774,95 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa %xmm0, 48(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm7 -; SSE2-NEXT: psrad $31, %xmm7 -; SSE2-NEXT: movdqa %xmm7, 16(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm6 +; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: movdqa %xmm6, 16(%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: umulo_v64i8: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %rdi, %rax -; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: pxor %xmm10, %xmm10 ; SSSE3-NEXT: movdqa %xmm4, %xmm8 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; SSSE3-NEXT: movdqa %xmm0, %xmm10 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; SSSE3-NEXT: pmullw %xmm8, %xmm10 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] +; SSSE3-NEXT: movdqa %xmm0, %xmm11 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; SSSE3-NEXT: pmullw %xmm8, %xmm11 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; SSSE3-NEXT: movdqa %xmm10, %xmm11 -; SSSE3-NEXT: pand %xmm8, %xmm11 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSSE3-NEXT: movdqa %xmm11, %xmm9 +; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSSE3-NEXT: pmullw %xmm4, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pand %xmm8, %xmm4 -; SSSE3-NEXT: packuswb %xmm11, %xmm4 -; SSSE3-NEXT: movdqa %xmm5, %xmm12 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; SSSE3-NEXT: movdqa %xmm1, %xmm11 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; SSSE3-NEXT: pmullw %xmm12, %xmm11 -; SSSE3-NEXT: movdqa %xmm11, %xmm12 -; SSSE3-NEXT: pand %xmm8, %xmm12 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSSE3-NEXT: packuswb %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; SSSE3-NEXT: movdqa %xmm1, %xmm12 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; SSSE3-NEXT: pmullw %xmm9, %xmm12 +; SSSE3-NEXT: movdqa %xmm12, %xmm9 +; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSSE3-NEXT: pmullw %xmm5, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm5 ; SSSE3-NEXT: pand %xmm8, %xmm5 -; SSSE3-NEXT: packuswb %xmm12, %xmm5 -; SSSE3-NEXT: movdqa %xmm6, %xmm13 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; SSSE3-NEXT: movdqa %xmm2, %xmm12 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; SSSE3-NEXT: pmullw %xmm13, %xmm12 -; SSSE3-NEXT: movdqa %xmm12, %xmm13 -; SSSE3-NEXT: pand %xmm8, %xmm13 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSSE3-NEXT: packuswb %xmm9, %xmm5 +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; SSSE3-NEXT: movdqa %xmm2, %xmm13 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] +; SSSE3-NEXT: pmullw %xmm9, %xmm13 +; SSSE3-NEXT: movdqa %xmm13, %xmm14 +; SSSE3-NEXT: pand %xmm8, %xmm14 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; SSSE3-NEXT: pmullw %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pand %xmm8, %xmm6 -; SSSE3-NEXT: packuswb %xmm13, %xmm6 -; SSSE3-NEXT: movdqa %xmm7, %xmm13 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] +; SSSE3-NEXT: movdqa %xmm2, %xmm9 +; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: packuswb %xmm14, %xmm9 +; SSSE3-NEXT: movdqa %xmm7, %xmm6 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] ; SSSE3-NEXT: movdqa %xmm3, %xmm14 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] -; SSSE3-NEXT: pmullw %xmm13, %xmm14 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] +; SSSE3-NEXT: pmullw %xmm6, %xmm14 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSSE3-NEXT: pmullw %xmm7, %xmm3 -; SSSE3-NEXT: movdqa %xmm14, %xmm7 -; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: movdqa %xmm14, %xmm6 +; SSSE3-NEXT: pand %xmm8, %xmm6 ; SSSE3-NEXT: pand %xmm3, %xmm8 -; SSSE3-NEXT: packuswb %xmm7, %xmm8 +; SSSE3-NEXT: packuswb %xmm6, %xmm8 ; SSSE3-NEXT: psrlw $8, %xmm14 ; SSSE3-NEXT: psrlw $8, %xmm3 ; SSSE3-NEXT: packuswb %xmm14, %xmm3 -; SSSE3-NEXT: psrlw $8, %xmm12 +; SSSE3-NEXT: psrlw $8, %xmm13 ; SSSE3-NEXT: psrlw $8, %xmm2 -; SSSE3-NEXT: packuswb %xmm12, %xmm2 -; SSSE3-NEXT: psrlw $8, %xmm11 +; SSSE3-NEXT: packuswb %xmm13, %xmm2 +; SSSE3-NEXT: psrlw $8, %xmm12 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: packuswb %xmm11, %xmm1 -; SSSE3-NEXT: psrlw $8, %xmm10 +; SSSE3-NEXT: packuswb %xmm12, %xmm1 +; SSSE3-NEXT: psrlw $8, %xmm11 ; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: packuswb %xmm10, %xmm0 -; SSSE3-NEXT: pcmpeqb %xmm9, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm9, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm9, %xmm1 -; SSSE3-NEXT: pcmpeqb %xmm9, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7 -; SSSE3-NEXT: pxor %xmm7, %xmm3 -; SSSE3-NEXT: pxor %xmm7, %xmm2 -; SSSE3-NEXT: pxor %xmm7, %xmm1 -; SSSE3-NEXT: pxor %xmm7, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSSE3-NEXT: packuswb %xmm11, %xmm0 +; SSSE3-NEXT: pcmpeqb %xmm10, %xmm3 +; SSSE3-NEXT: pcmpeqb %xmm10, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm10, %xmm1 +; SSSE3-NEXT: pcmpeqb %xmm10, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 +; SSSE3-NEXT: pxor %xmm6, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm2 +; SSSE3-NEXT: pxor %xmm6, %xmm1 +; SSSE3-NEXT: pxor %xmm6, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm6 ; SSSE3-NEXT: movdqa %xmm8, 48(%rsi) -; SSSE3-NEXT: movdqa %xmm1, %xmm8 -; SSSE3-NEXT: movdqa %xmm6, 32(%rsi) -; SSSE3-NEXT: movdqa %xmm2, %xmm6 +; SSSE3-NEXT: movdqa %xmm1, %xmm7 +; SSSE3-NEXT: movdqa %xmm9, 32(%rsi) +; SSSE3-NEXT: movdqa %xmm2, %xmm8 ; SSSE3-NEXT: movdqa %xmm5, 16(%rsi) ; SSSE3-NEXT: movdqa %xmm3, %xmm5 ; SSSE3-NEXT: movdqa %xmm4, (%rsi) @@ -1908,11 +1908,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, 176(%rdi) ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm6 -; SSSE3-NEXT: psrad $31, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, 144(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm8 +; SSSE3-NEXT: psrad $31, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, 144(%rdi) ; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm2 @@ -1923,11 +1923,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, 112(%rdi) ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm8 -; SSSE3-NEXT: psrad $31, %xmm8 -; SSSE3-NEXT: movdqa %xmm8, 80(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm7 +; SSSE3-NEXT: psrad $31, %xmm7 +; SSSE3-NEXT: movdqa %xmm7, 80(%rdi) ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm1 @@ -1937,11 +1937,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, 48(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm7 -; SSSE3-NEXT: psrad $31, %xmm7 -; SSSE3-NEXT: movdqa %xmm7, 16(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm6 +; SSSE3-NEXT: psrad $31, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, 16(%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: umulo_v64i8: @@ -2094,32 +2094,32 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm6 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm4 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpand %xmm7, %xmm9, %xmm8 -; AVX1-NEXT: vpackuswb %xmm4, %xmm8, %xmm4 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm4 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm9 +; AVX1-NEXT: vpackuswb %xmm4, %xmm9, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; AVX1-NEXT: vpmullw %xmm8, %xmm10, %xmm8 -; AVX1-NEXT: vpand %xmm9, %xmm8, %xmm11 +; AVX1-NEXT: vpmullw %xmm9, %xmm10, %xmm9 +; AVX1-NEXT: vpand %xmm7, %xmm9, %xmm11 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm10 -; AVX1-NEXT: vpand %xmm9, %xmm10, %xmm0 +; AVX1-NEXT: vpand %xmm7, %xmm10, %xmm0 ; AVX1-NEXT: vpackuswb %xmm11, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; AVX1-NEXT: vpmullw %xmm2, %xmm11, %xmm11 -; AVX1-NEXT: vpand %xmm9, %xmm11, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm11, %xmm2 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm12 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmullw %xmm12, %xmm13, %xmm12 -; AVX1-NEXT: vpand %xmm9, %xmm12, %xmm13 +; AVX1-NEXT: vpand %xmm7, %xmm12, %xmm13 ; AVX1-NEXT: vpackuswb %xmm2, %xmm13, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] @@ -2128,77 +2128,77 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpmullw %xmm13, %xmm14, %xmm13 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm9, %xmm13, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm9 -; AVX1-NEXT: vpackuswb %xmm1, %xmm9, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm13, %xmm9 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm9, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm11, %xmm9 +; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm7, %xmm13, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm7 +; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm7 +; AVX1-NEXT: vpsrlw $8, %xmm13, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm11, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm12, %xmm11 -; AVX1-NEXT: vpackuswb %xmm9, %xmm11, %xmm9 -; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm8 +; AVX1-NEXT: vpackuswb %xmm3, %xmm11, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm9, %xmm9 ; AVX1-NEXT: vpsrlw $8, %xmm10, %xmm10 -; AVX1-NEXT: vpackuswb %xmm8, %xmm10, %xmm8 +; AVX1-NEXT: vpackuswb %xmm9, %xmm10, %xmm9 ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 -; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7 -; AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm8 +; AVX1-NEXT: vpackuswb %xmm6, %xmm8, %xmm6 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm5, %xmm9, %xmm9 -; AVX1-NEXT: vpcmpeqb %xmm5, %xmm8, %xmm8 -; AVX1-NEXT: vpcmpeqb %xmm5, %xmm6, %xmm10 -; AVX1-NEXT: vpcmpeqd %xmm11, %xmm11, %xmm11 -; AVX1-NEXT: vpxor %xmm3, %xmm11, %xmm7 -; AVX1-NEXT: vpxor %xmm11, %xmm9, %xmm6 -; AVX1-NEXT: vpxor %xmm11, %xmm8, %xmm5 -; AVX1-NEXT: vpxor %xmm11, %xmm10, %xmm3 -; AVX1-NEXT: vmovdqa %xmm1, 48(%rsi) +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm9, %xmm8 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm6, %xmm9 +; AVX1-NEXT: vpcmpeqd %xmm10, %xmm10, %xmm10 +; AVX1-NEXT: vpxor %xmm1, %xmm10, %xmm6 +; AVX1-NEXT: vpxor %xmm3, %xmm10, %xmm5 +; AVX1-NEXT: vpxor %xmm10, %xmm8, %xmm3 +; AVX1-NEXT: vpxor %xmm10, %xmm9, %xmm1 +; AVX1-NEXT: vmovdqa %xmm7, 48(%rsi) ; AVX1-NEXT: vmovdqa %xmm2, 32(%rsi) ; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi) ; AVX1-NEXT: vmovdqa %xmm4, (%rsi) -; AVX1-NEXT: vpmovsxbd %xmm7, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 192(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm6, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 128(%rdi) +; AVX1-NEXT: vmovdqa %xmm0, 192(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 64(%rdi) +; AVX1-NEXT: vmovdqa %xmm0, 128(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, 64(%rdi) +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 224(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 240(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 208(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 160(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 176(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 144(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 96(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 112(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 80(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 48(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 16(%rdi) ; AVX1-NEXT: vzeroupper @@ -2907,14 +2907,14 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: pushq %r14 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movq %r9, %r10 -; SSE2-NEXT: movq %rcx, %r11 +; SSE2-NEXT: movq %r9, %r11 +; SSE2-NEXT: movq %rcx, %r10 ; SSE2-NEXT: movq %rdx, %rcx ; SSE2-NEXT: movq %rsi, %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; SSE2-NEXT: testq %r10, %r10 +; SSE2-NEXT: testq %r11, %r11 ; SSE2-NEXT: setne %dl ; SSE2-NEXT: testq %rsi, %rsi ; SSE2-NEXT: setne %bpl @@ -2922,32 +2922,32 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: mulq %r8 ; SSE2-NEXT: movq %rax, %rsi ; SSE2-NEXT: seto %r15b -; SSE2-NEXT: movq %r10, %rax +; SSE2-NEXT: movq %r11, %rax ; SSE2-NEXT: mulq %rdi ; SSE2-NEXT: seto %r12b ; SSE2-NEXT: orb %r15b, %r12b ; SSE2-NEXT: orb %bpl, %r12b -; SSE2-NEXT: leaq (%rsi,%rax), %r10 +; SSE2-NEXT: leaq (%rsi,%rax), %r11 ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: mulq %r8 ; SSE2-NEXT: movq %rax, %rdi ; SSE2-NEXT: movq %rdx, %rsi -; SSE2-NEXT: addq %r10, %rsi -; SSE2-NEXT: setb %r10b -; SSE2-NEXT: orb %r12b, %r10b +; SSE2-NEXT: addq %r11, %rsi +; SSE2-NEXT: setb %r11b +; SSE2-NEXT: orb %r12b, %r11b ; SSE2-NEXT: testq %r9, %r9 ; SSE2-NEXT: setne %al -; SSE2-NEXT: testq %r11, %r11 +; SSE2-NEXT: testq %r10, %r10 ; SSE2-NEXT: setne %bpl ; SSE2-NEXT: andb %al, %bpl -; SSE2-NEXT: movq %r11, %rax +; SSE2-NEXT: movq %r10, %rax ; SSE2-NEXT: mulq %r14 ; SSE2-NEXT: movq %rax, %r8 -; SSE2-NEXT: seto %r11b +; SSE2-NEXT: seto %r10b ; SSE2-NEXT: movq %r9, %rax ; SSE2-NEXT: mulq %rcx ; SSE2-NEXT: seto %r9b -; SSE2-NEXT: orb %r11b, %r9b +; SSE2-NEXT: orb %r10b, %r9b ; SSE2-NEXT: orb %bpl, %r9b ; SSE2-NEXT: addq %rax, %r8 ; SSE2-NEXT: movq %rcx, %rax @@ -2958,7 +2958,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: negl %ecx ; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: movzbl %r10b, %ecx +; SSE2-NEXT: movzbl %r11b, %ecx ; SSE2-NEXT: negl %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -2980,14 +2980,14 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: pushq %r14 ; SSSE3-NEXT: pushq %r12 ; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movq %r9, %r10 -; SSSE3-NEXT: movq %rcx, %r11 +; SSSE3-NEXT: movq %r9, %r11 +; SSSE3-NEXT: movq %rcx, %r10 ; SSSE3-NEXT: movq %rdx, %rcx ; SSSE3-NEXT: movq %rsi, %rax ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; SSSE3-NEXT: testq %r10, %r10 +; SSSE3-NEXT: testq %r11, %r11 ; SSSE3-NEXT: setne %dl ; SSSE3-NEXT: testq %rsi, %rsi ; SSSE3-NEXT: setne %bpl @@ -2995,32 +2995,32 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: mulq %r8 ; SSSE3-NEXT: movq %rax, %rsi ; SSSE3-NEXT: seto %r15b -; SSSE3-NEXT: movq %r10, %rax +; SSSE3-NEXT: movq %r11, %rax ; SSSE3-NEXT: mulq %rdi ; SSSE3-NEXT: seto %r12b ; SSSE3-NEXT: orb %r15b, %r12b ; SSSE3-NEXT: orb %bpl, %r12b -; SSSE3-NEXT: leaq (%rsi,%rax), %r10 +; SSSE3-NEXT: leaq (%rsi,%rax), %r11 ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: mulq %r8 ; SSSE3-NEXT: movq %rax, %rdi ; SSSE3-NEXT: movq %rdx, %rsi -; SSSE3-NEXT: addq %r10, %rsi -; SSSE3-NEXT: setb %r10b -; SSSE3-NEXT: orb %r12b, %r10b +; SSSE3-NEXT: addq %r11, %rsi +; SSSE3-NEXT: setb %r11b +; SSSE3-NEXT: orb %r12b, %r11b ; SSSE3-NEXT: testq %r9, %r9 ; SSSE3-NEXT: setne %al -; SSSE3-NEXT: testq %r11, %r11 +; SSSE3-NEXT: testq %r10, %r10 ; SSSE3-NEXT: setne %bpl ; SSSE3-NEXT: andb %al, %bpl -; SSSE3-NEXT: movq %r11, %rax +; SSSE3-NEXT: movq %r10, %rax ; SSSE3-NEXT: mulq %r14 ; SSSE3-NEXT: movq %rax, %r8 -; SSSE3-NEXT: seto %r11b +; SSSE3-NEXT: seto %r10b ; SSSE3-NEXT: movq %r9, %rax ; SSSE3-NEXT: mulq %rcx ; SSSE3-NEXT: seto %r9b -; SSSE3-NEXT: orb %r11b, %r9b +; SSSE3-NEXT: orb %r10b, %r9b ; SSSE3-NEXT: orb %bpl, %r9b ; SSSE3-NEXT: addq %rax, %r8 ; SSSE3-NEXT: movq %rcx, %rax @@ -3031,7 +3031,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: negl %ecx ; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: movzbl %r10b, %ecx +; SSSE3-NEXT: movzbl %r11b, %ecx ; SSSE3-NEXT: negl %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -3053,14 +3053,14 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: pushq %r14 ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: movq %r9, %r10 -; SSE41-NEXT: movq %rcx, %r11 +; SSE41-NEXT: movq %r9, %r11 +; SSE41-NEXT: movq %rcx, %r10 ; SSE41-NEXT: movq %rdx, %rcx ; SSE41-NEXT: movq %rsi, %rax ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; SSE41-NEXT: testq %r10, %r10 +; SSE41-NEXT: testq %r11, %r11 ; SSE41-NEXT: setne %dl ; SSE41-NEXT: testq %rsi, %rsi ; SSE41-NEXT: setne %bpl @@ -3068,32 +3068,32 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: mulq %r8 ; SSE41-NEXT: movq %rax, %rsi ; SSE41-NEXT: seto %r15b -; SSE41-NEXT: movq %r10, %rax +; SSE41-NEXT: movq %r11, %rax ; SSE41-NEXT: mulq %rdi ; SSE41-NEXT: seto %r12b ; SSE41-NEXT: orb %r15b, %r12b ; SSE41-NEXT: orb %bpl, %r12b -; SSE41-NEXT: leaq (%rsi,%rax), %r10 +; SSE41-NEXT: leaq (%rsi,%rax), %r11 ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: mulq %r8 ; SSE41-NEXT: movq %rax, %rdi ; SSE41-NEXT: movq %rdx, %rsi -; SSE41-NEXT: addq %r10, %rsi -; SSE41-NEXT: setb %r10b -; SSE41-NEXT: orb %r12b, %r10b +; SSE41-NEXT: addq %r11, %rsi +; SSE41-NEXT: setb %r11b +; SSE41-NEXT: orb %r12b, %r11b ; SSE41-NEXT: testq %r9, %r9 ; SSE41-NEXT: setne %al -; SSE41-NEXT: testq %r11, %r11 +; SSE41-NEXT: testq %r10, %r10 ; SSE41-NEXT: setne %bpl ; SSE41-NEXT: andb %al, %bpl -; SSE41-NEXT: movq %r11, %rax +; SSE41-NEXT: movq %r10, %rax ; SSE41-NEXT: mulq %r14 ; SSE41-NEXT: movq %rax, %r8 -; SSE41-NEXT: seto %r11b +; SSE41-NEXT: seto %r10b ; SSE41-NEXT: movq %r9, %rax ; SSE41-NEXT: mulq %rcx ; SSE41-NEXT: seto %r9b -; SSE41-NEXT: orb %r11b, %r9b +; SSE41-NEXT: orb %r10b, %r9b ; SSE41-NEXT: orb %bpl, %r9b ; SSE41-NEXT: addq %rax, %r8 ; SSE41-NEXT: movq %rcx, %rax @@ -3103,7 +3103,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: orb %r9b, %cl ; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: negl %ecx -; SSE41-NEXT: movzbl %r10b, %r8d +; SSE41-NEXT: movzbl %r11b, %r8d ; SSE41-NEXT: negl %r8d ; SSE41-NEXT: movd %r8d, %xmm0 ; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 @@ -3125,14 +3125,14 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: pushq %r14 ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq %r9, %r10 -; AVX-NEXT: movq %rcx, %r11 +; AVX-NEXT: movq %r9, %r11 +; AVX-NEXT: movq %rcx, %r10 ; AVX-NEXT: movq %rdx, %rcx ; AVX-NEXT: movq %rsi, %rax ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; AVX-NEXT: testq %r10, %r10 +; AVX-NEXT: testq %r11, %r11 ; AVX-NEXT: setne %dl ; AVX-NEXT: testq %rsi, %rsi ; AVX-NEXT: setne %bpl @@ -3140,32 +3140,32 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: mulq %r8 ; AVX-NEXT: movq %rax, %rsi ; AVX-NEXT: seto %r15b -; AVX-NEXT: movq %r10, %rax +; AVX-NEXT: movq %r11, %rax ; AVX-NEXT: mulq %rdi ; AVX-NEXT: seto %r12b ; AVX-NEXT: orb %r15b, %r12b ; AVX-NEXT: orb %bpl, %r12b -; AVX-NEXT: leaq (%rsi,%rax), %r10 +; AVX-NEXT: leaq (%rsi,%rax), %r11 ; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: mulq %r8 ; AVX-NEXT: movq %rax, %rdi ; AVX-NEXT: movq %rdx, %rsi -; AVX-NEXT: addq %r10, %rsi -; AVX-NEXT: setb %r10b -; AVX-NEXT: orb %r12b, %r10b +; AVX-NEXT: addq %r11, %rsi +; AVX-NEXT: setb %r11b +; AVX-NEXT: orb %r12b, %r11b ; AVX-NEXT: testq %r9, %r9 ; AVX-NEXT: setne %al -; AVX-NEXT: testq %r11, %r11 +; AVX-NEXT: testq %r10, %r10 ; AVX-NEXT: setne %bpl ; AVX-NEXT: andb %al, %bpl -; AVX-NEXT: movq %r11, %rax +; AVX-NEXT: movq %r10, %rax ; AVX-NEXT: mulq %r14 ; AVX-NEXT: movq %rax, %r8 -; AVX-NEXT: seto %r11b +; AVX-NEXT: seto %r10b ; AVX-NEXT: movq %r9, %rax ; AVX-NEXT: mulq %rcx ; AVX-NEXT: seto %r9b -; AVX-NEXT: orb %r11b, %r9b +; AVX-NEXT: orb %r10b, %r9b ; AVX-NEXT: orb %bpl, %r9b ; AVX-NEXT: addq %rax, %r8 ; AVX-NEXT: movq %rcx, %rax @@ -3175,7 +3175,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: orb %r9b, %cl ; AVX-NEXT: movzbl %cl, %ecx ; AVX-NEXT: negl %ecx -; AVX-NEXT: movzbl %r10b, %r8d +; AVX-NEXT: movzbl %r11b, %r8d ; AVX-NEXT: negl %r8d ; AVX-NEXT: vmovd %r8d, %xmm0 ; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll index df5da63b50359..a58c3dd0d5307 100644 --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -240,35 +240,35 @@ define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE2-NEXT: movd %r8d, %xmm0 ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: movd %edx, %xmm2 ; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: movd %r9d, %xmm1 ; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psubd %xmm2, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: psubd %xmm3, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm4, (%rcx) -; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psubd %xmm3, %xmm0 +; SSE2-NEXT: psubd %xmm2, %xmm0 ; SSE2-NEXT: movq %xmm0, 16(%rcx) -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE2-NEXT: movq %xmm0, 16(%rdi) ; SSE2-NEXT: movdqa %xmm4, (%rdi) @@ -281,35 +281,35 @@ define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSSE3-NEXT: movd %r8d, %xmm0 ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: movd %edx, %xmm2 ; SSSE3-NEXT: movd %esi, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSSE3-NEXT: movd %r9d, %xmm1 ; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: psubd %xmm2, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: psubd %xmm3, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm4, (%rcx) -; SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm3, %xmm0 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: psubd %xmm3, %xmm0 +; SSSE3-NEXT: psubd %xmm2, %xmm0 ; SSSE3-NEXT: movq %xmm0, 16(%rcx) -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm1 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 ; SSSE3-NEXT: movq %xmm0, 16(%rdi) ; SSSE3-NEXT: movdqa %xmm4, (%rdi) diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll index 890514fbdc022..6d71564dd57f9 100644 --- a/llvm/test/CodeGen/X86/vector-bo-select.ll +++ b/llvm/test/CodeGen/X86/vector-bo-select.ll @@ -1404,38 +1404,38 @@ define <8 x double> @fadd_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; ; SSE42-LABEL: fadd_v8f64_cast_cond: ; SSE42: # %bb.0: -; SSE42-NEXT: movapd %xmm0, %xmm8 +; SSE42-NEXT: movapd %xmm0, %xmm9 ; SSE42-NEXT: movd %edi, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [64,128] -; SSE42-NEXT: movdqa %xmm10, %xmm0 -; SSE42-NEXT: pand %xmm9, %xmm0 -; SSE42-NEXT: pcmpeqq %xmm9, %xmm0 -; SSE42-NEXT: movapd {{.*#+}} xmm9 = [-0.0E+0,-0.0E+0] -; SSE42-NEXT: movapd %xmm9, %xmm11 +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: pand %xmm10, %xmm0 +; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 +; SSE42-NEXT: movapd {{.*#+}} xmm10 = [-0.0E+0,-0.0E+0] +; SSE42-NEXT: movapd %xmm10, %xmm11 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 ; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] -; SSE42-NEXT: movdqa %xmm10, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 -; SSE42-NEXT: movapd %xmm9, %xmm7 +; SSE42-NEXT: movapd %xmm10, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE42-NEXT: movdqa %xmm10, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 -; SSE42-NEXT: movapd %xmm9, %xmm6 +; SSE42-NEXT: movapd %xmm10, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 ; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] -; SSE42-NEXT: pand %xmm0, %xmm10 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm10 -; SSE42-NEXT: movdqa %xmm10, %xmm0 -; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm9 -; SSE42-NEXT: addpd %xmm8, %xmm9 +; SSE42-NEXT: pand %xmm0, %xmm8 +; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 +; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm10 +; SSE42-NEXT: addpd %xmm9, %xmm10 ; SSE42-NEXT: addpd %xmm6, %xmm1 ; SSE42-NEXT: addpd %xmm7, %xmm2 ; SSE42-NEXT: addpd %xmm11, %xmm3 -; SSE42-NEXT: movapd %xmm9, %xmm0 +; SSE42-NEXT: movapd %xmm10, %xmm0 ; SSE42-NEXT: retq ; ; AVX2-LABEL: fadd_v8f64_cast_cond: @@ -1734,38 +1734,38 @@ define <8 x double> @fmul_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; ; SSE42-LABEL: fmul_v8f64_cast_cond: ; SSE42: # %bb.0: -; SSE42-NEXT: movapd %xmm0, %xmm8 +; SSE42-NEXT: movapd %xmm0, %xmm9 ; SSE42-NEXT: movd %edi, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [64,128] -; SSE42-NEXT: movdqa %xmm10, %xmm0 -; SSE42-NEXT: pand %xmm9, %xmm0 -; SSE42-NEXT: pcmpeqq %xmm9, %xmm0 -; SSE42-NEXT: movapd {{.*#+}} xmm9 = [1.0E+0,1.0E+0] -; SSE42-NEXT: movapd %xmm9, %xmm11 +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: pand %xmm10, %xmm0 +; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 +; SSE42-NEXT: movapd {{.*#+}} xmm10 = [1.0E+0,1.0E+0] +; SSE42-NEXT: movapd %xmm10, %xmm11 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 ; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] -; SSE42-NEXT: movdqa %xmm10, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 -; SSE42-NEXT: movapd %xmm9, %xmm7 +; SSE42-NEXT: movapd %xmm10, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE42-NEXT: movdqa %xmm10, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 -; SSE42-NEXT: movapd %xmm9, %xmm6 +; SSE42-NEXT: movapd %xmm10, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 ; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] -; SSE42-NEXT: pand %xmm0, %xmm10 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm10 -; SSE42-NEXT: movdqa %xmm10, %xmm0 -; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm9 -; SSE42-NEXT: mulpd %xmm8, %xmm9 +; SSE42-NEXT: pand %xmm0, %xmm8 +; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 +; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm10 +; SSE42-NEXT: mulpd %xmm9, %xmm10 ; SSE42-NEXT: mulpd %xmm6, %xmm1 ; SSE42-NEXT: mulpd %xmm7, %xmm2 ; SSE42-NEXT: mulpd %xmm11, %xmm3 -; SSE42-NEXT: movapd %xmm9, %xmm0 +; SSE42-NEXT: movapd %xmm10, %xmm0 ; SSE42-NEXT: retq ; ; AVX2-LABEL: fmul_v8f64_cast_cond: @@ -1922,38 +1922,38 @@ define <8 x double> @fdiv_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; ; SSE42-LABEL: fdiv_v8f64_cast_cond: ; SSE42: # %bb.0: -; SSE42-NEXT: movapd %xmm0, %xmm8 +; SSE42-NEXT: movapd %xmm0, %xmm9 ; SSE42-NEXT: movd %edi, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] ; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] -; SSE42-NEXT: movdqa %xmm9, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm10, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 ; SSE42-NEXT: movapd {{.*#+}} xmm11 = [1.0E+0,1.0E+0] ; SSE42-NEXT: movapd %xmm11, %xmm10 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm10 ; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] -; SSE42-NEXT: movdqa %xmm9, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 ; SSE42-NEXT: movapd %xmm11, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE42-NEXT: movdqa %xmm9, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 ; SSE42-NEXT: movapd %xmm11, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 ; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] -; SSE42-NEXT: pand %xmm0, %xmm9 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm9 -; SSE42-NEXT: movdqa %xmm9, %xmm0 +; SSE42-NEXT: pand %xmm0, %xmm8 +; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm11 -; SSE42-NEXT: divpd %xmm11, %xmm8 +; SSE42-NEXT: divpd %xmm11, %xmm9 ; SSE42-NEXT: divpd %xmm6, %xmm1 ; SSE42-NEXT: divpd %xmm7, %xmm2 ; SSE42-NEXT: divpd %xmm10, %xmm3 -; SSE42-NEXT: movapd %xmm8, %xmm0 +; SSE42-NEXT: movapd %xmm9, %xmm0 ; SSE42-NEXT: retq ; ; AVX2-LABEL: fdiv_v8f64_cast_cond: @@ -2982,43 +2982,43 @@ define <8 x i64> @mul_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; ; SSE42-LABEL: mul_v8i64_cast_cond: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa %xmm0, %xmm8 +; SSE42-NEXT: movdqa %xmm0, %xmm9 ; SSE42-NEXT: movd %edi, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [64,128] -; SSE42-NEXT: movdqa %xmm10, %xmm0 -; SSE42-NEXT: pand %xmm9, %xmm0 -; SSE42-NEXT: pcmpeqq %xmm9, %xmm0 -; SSE42-NEXT: movapd {{.*#+}} xmm9 = [1,1] -; SSE42-NEXT: movapd %xmm9, %xmm11 +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: pand %xmm10, %xmm0 +; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 +; SSE42-NEXT: movapd {{.*#+}} xmm10 = [1,1] +; SSE42-NEXT: movapd %xmm10, %xmm11 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 ; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] -; SSE42-NEXT: movdqa %xmm10, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 -; SSE42-NEXT: movapd %xmm9, %xmm7 +; SSE42-NEXT: movapd %xmm10, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE42-NEXT: movdqa %xmm10, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 -; SSE42-NEXT: movapd %xmm9, %xmm6 +; SSE42-NEXT: movapd %xmm10, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 ; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] -; SSE42-NEXT: pand %xmm0, %xmm10 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm10 -; SSE42-NEXT: movdqa %xmm10, %xmm0 -; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm9 +; SSE42-NEXT: pand %xmm0, %xmm8 +; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 ; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm10 +; SSE42-NEXT: movdqa %xmm9, %xmm0 ; SSE42-NEXT: psrlq $32, %xmm0 -; SSE42-NEXT: pmuludq %xmm9, %xmm0 -; SSE42-NEXT: movdqa %xmm9, %xmm4 +; SSE42-NEXT: pmuludq %xmm10, %xmm0 +; SSE42-NEXT: movdqa %xmm10, %xmm4 ; SSE42-NEXT: psrlq $32, %xmm4 -; SSE42-NEXT: pmuludq %xmm8, %xmm4 +; SSE42-NEXT: pmuludq %xmm9, %xmm4 ; SSE42-NEXT: paddq %xmm0, %xmm4 ; SSE42-NEXT: psllq $32, %xmm4 -; SSE42-NEXT: pmuludq %xmm8, %xmm9 -; SSE42-NEXT: paddq %xmm4, %xmm9 +; SSE42-NEXT: pmuludq %xmm9, %xmm10 +; SSE42-NEXT: paddq %xmm4, %xmm10 ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: psrlq $32, %xmm0 ; SSE42-NEXT: pmuludq %xmm6, %xmm0 @@ -3049,7 +3049,7 @@ define <8 x i64> @mul_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42-NEXT: psllq $32, %xmm4 ; SSE42-NEXT: pmuludq %xmm11, %xmm3 ; SSE42-NEXT: paddq %xmm4, %xmm3 -; SSE42-NEXT: movdqa %xmm9, %xmm0 +; SSE42-NEXT: movdqa %xmm10, %xmm0 ; SSE42-NEXT: retq ; ; AVX2-LABEL: mul_v8i64_cast_cond: diff --git a/llvm/test/CodeGen/X86/vector-compare-results.ll b/llvm/test/CodeGen/X86/vector-compare-results.ll index 2c7f65064a5e6..f4d6b52377f57 100644 --- a/llvm/test/CodeGen/X86/vector-compare-results.ll +++ b/llvm/test/CodeGen/X86/vector-compare-results.ll @@ -1742,35 +1742,35 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind ; SSE-NEXT: packssdw %xmm4, %xmm3 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: cmpltpd %xmm1, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: cmpltpd %xmm0, %xmm1 -; SSE-NEXT: packssdw %xmm2, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: cmpltpd %xmm0, %xmm4 +; SSE-NEXT: packssdw %xmm2, %xmm4 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: packssdw %xmm3, %xmm1 +; SSE-NEXT: packssdw %xmm3, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: packsswb %xmm5, %xmm4 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: packsswb %xmm5, %xmm1 +; SSE-NEXT: pmovmskb %xmm4, %ecx ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: pmovmskb %xmm1, %ecx -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: packssdw %xmm1, %xmm3 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: packssdw %xmm1, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: packssdw %xmm3, %xmm2 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: packssdw %xmm3, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: packssdw %xmm2, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: packssdw %xmm2, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: packssdw %xmm2, %xmm0 ; SSE-NEXT: packssdw %xmm3, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %edx ; SSE-NEXT: shll $16, %edx ; SSE-NEXT: orl %ecx, %edx diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll index 818480854c9df..ed6f5007b77e3 100644 --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -54,12 +54,12 @@ define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind { ; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -105,12 +105,12 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind { ; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -480,12 +480,12 @@ define <4 x float> @cvt_4i16_to_4f32_constrained(<4 x i16> %a0) nounwind strictf ; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -895,12 +895,12 @@ define <4 x float> @load_cvt_8i16_to_4f32(ptr %a0) nounwind { ; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1318,12 +1318,12 @@ define <4 x float> @load_cvt_8i16_to_4f32_constrained(ptr %a0) nounwind strictfp ; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll index 28241da6506a9..fcc68f09c1d20 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll @@ -503,56 +503,55 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: subq $40, %rsp ; SSE-NEXT: movdqa 96(%rdi), %xmm13 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 144(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: packssdw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pslld $16, %xmm12 +; SSE-NEXT: psrad $16, %xmm12 +; SSE-NEXT: packssdw %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: pslld $16, %xmm10 ; SSE-NEXT: psrad $16, %xmm10 ; SSE-NEXT: packssdw %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: pslld $16, %xmm9 ; SSE-NEXT: psrad $16, %xmm9 ; SSE-NEXT: packssdw %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: pslld $16, %xmm11 ; SSE-NEXT: psrad $16, %xmm11 ; SSE-NEXT: packssdw %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: pslld $16, %xmm13 ; SSE-NEXT: psrad $16, %xmm13 ; SSE-NEXT: packssdw %xmm0, %xmm13 -; SSE-NEXT: movdqa 240(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: movdqa 224(%rdi), %xmm7 @@ -597,9 +596,9 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: psrad $16, %xmm3 ; SSE-NEXT: psrad $16, %xmm5 ; SSE-NEXT: packssdw %xmm3, %xmm5 -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload ; SSE-NEXT: psrad $16, %xmm6 ; SSE-NEXT: packssdw %xmm0, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -607,9 +606,10 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: psrad $16, %xmm3 ; SSE-NEXT: packssdw %xmm0, %xmm3 -; SSE-NEXT: psrad $16, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm7 -; SSE-NEXT: packssdw %xmm12, %xmm7 +; SSE-NEXT: packssdw %xmm0, %xmm7 ; SSE-NEXT: psrad $16, %xmm8 ; SSE-NEXT: psrad $16, %xmm2 ; SSE-NEXT: packssdw %xmm8, %xmm2 @@ -620,8 +620,7 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movdqa %xmm11, 64(%rsi) ; SSE-NEXT: movdqa %xmm9, (%rsi) ; SSE-NEXT: movdqa %xmm10, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movdqa %xmm12, 16(%rsi) ; SSE-NEXT: movdqa %xmm2, 96(%rdx) ; SSE-NEXT: movdqa %xmm7, 112(%rdx) ; SSE-NEXT: movdqa %xmm3, 64(%rdx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index 1967248590bc1..2fdc04bd73455 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -454,14 +454,14 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 16(%rdi), %xmm4 ; SSE-NEXT: movdqa 32(%rdi), %xmm3 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm6, %xmm8 ; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: movdqa %xmm5, %xmm10 ; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: pand %xmm6, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7] @@ -473,82 +473,82 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm8[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,0] ; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm8, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm8[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,0] -; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm11[2,0] +; SSE-NEXT: movdqa %xmm5, %xmm8 ; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: pand %xmm5, %xmm9 ; SSE-NEXT: por %xmm8, %xmm9 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm9[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: por %xmm8, %xmm12 -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pand %xmm6, %xmm11 -; SSE-NEXT: por %xmm8, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm12, %xmm9 +; SSE-NEXT: por %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: pand %xmm5, %xmm12 +; SSE-NEXT: por %xmm11, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm12[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm8, %xmm12 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm13 -; SSE-NEXT: por %xmm8, %xmm13 -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: pandn %xmm13, %xmm11 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: pand %xmm5, %xmm4 ; SSE-NEXT: por %xmm10, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,0] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movaps %xmm5, 16(%rsi) +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movaps %xmm6, 16(%rsi) ; SSE-NEXT: movaps %xmm7, (%rsi) -; SSE-NEXT: movdqa %xmm13, 16(%rdx) -; SSE-NEXT: movdqa %xmm12, (%rdx) -; SSE-NEXT: movdqa %xmm9, 16(%rcx) -; SSE-NEXT: movdqa %xmm8, (%rcx) +; SSE-NEXT: movdqa %xmm11, 16(%rdx) +; SSE-NEXT: movdqa %xmm9, (%rdx) +; SSE-NEXT: movdqa %xmm8, 16(%rcx) +; SSE-NEXT: movdqa %xmm10, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride3_vf16: @@ -709,23 +709,23 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i16_stride3_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $56, %rsp +; SSE-NEXT: subq $40, %rsp ; SSE-NEXT: movdqa 96(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 176(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm9 -; SSE-NEXT: movdqa 160(%rdi), %xmm8 -; SSE-NEXT: movdqa 80(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: movdqa 144(%rdi), %xmm13 +; SSE-NEXT: movdqa 160(%rdi), %xmm9 +; SSE-NEXT: movdqa 80(%rdi), %xmm11 +; SSE-NEXT: movdqa (%rdi), %xmm15 ; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa 32(%rdi), %xmm14 +; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa 64(%rdi), %xmm11 +; SSE-NEXT: movdqa 64(%rdi), %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm12, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 @@ -733,16 +733,17 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,2,1] +; SSE-NEXT: movdqa %xmm11, %xmm8 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] @@ -750,18 +751,20 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm11 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,1,2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0] @@ -771,17 +774,16 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa 128(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: movdqa 128(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: movdqa %xmm6, %xmm9 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[2,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] @@ -790,36 +792,56 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,5,6] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm14, %xmm6 ; SSE-NEXT: pandn %xmm13, %xmm6 ; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm13 -; SSE-NEXT: pandn %xmm10, %xmm13 -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: pandn %xmm7, %xmm13 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm13, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm14[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,4,5,6] ; SSE-NEXT: movdqa %xmm0, %xmm13 @@ -831,116 +853,95 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm9, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: por %xmm9, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm10[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm12, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm14, %xmm12 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,4,7,6,7] +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm14, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] ; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movdqa %xmm1, 32(%rdx) -; SSE-NEXT: movdqa %xmm9, 48(%rdx) -; SSE-NEXT: movdqa %xmm13, (%rdx) -; SSE-NEXT: movdqa %xmm7, 16(%rdx) +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movdqa %xmm5, 32(%rdx) +; SSE-NEXT: movdqa %xmm13, 48(%rdx) +; SSE-NEXT: movdqa %xmm15, (%rdx) +; SSE-NEXT: movdqa %xmm10, 16(%rdx) ; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm5, 48(%rcx) +; SSE-NEXT: movdqa %xmm2, 48(%rcx) ; SSE-NEXT: movdqa %xmm3, (%rcx) ; SSE-NEXT: movdqa %xmm4, 16(%rcx) -; SSE-NEXT: addq $56, %rsp +; SSE-NEXT: addq $40, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride3_vf32: @@ -952,24 +953,24 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm8[1],xmm5[2,3],xmm8[4],xmm5[5,6],xmm8[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm8[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[0,3,2,3,4,5,6,7] @@ -979,10 +980,10 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm14, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] @@ -990,59 +991,59 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm11 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm11[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm11[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1],xmm8[2],xmm5[3,4],xmm8[5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm11[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm11[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm10[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1],xmm12[2],xmm13[3,4],xmm12[5],xmm13[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3],mem[4],xmm5[5,6],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3],mem[4],xmm6[5,6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm6[1],xmm8[2,3],xmm6[4],xmm8[5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 48(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm14, (%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm15, (%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX1-ONLY-NEXT: vzeroupper @@ -1246,909 +1247,905 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i16_stride3_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $408, %rsp # imm = 0x198 -; SSE-NEXT: movdqa 192(%rdi), %xmm11 +; SSE-NEXT: subq $440, %rsp # imm = 0x1B8 +; SSE-NEXT: movdqa 192(%rdi), %xmm14 +; SSE-NEXT: movdqa 272(%rdi), %xmm6 +; SSE-NEXT: movdqa 240(%rdi), %xmm5 +; SSE-NEXT: movdqa 256(%rdi), %xmm7 +; SSE-NEXT: movdqa 80(%rdi), %xmm10 +; SSE-NEXT: movdqa (%rdi), %xmm15 +; SSE-NEXT: movdqa 16(%rdi), %xmm9 +; SSE-NEXT: movdqa 32(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa 64(%rdi), %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,7,6,7] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm5 -; SSE-NEXT: movdqa 80(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm15 -; SSE-NEXT: movdqa 32(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 64(%rdi), %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,1,2,1] ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,1,2,1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0] +; SSE-NEXT: movdqa 208(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa 224(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,2,1] +; SSE-NEXT: movdqa 160(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 144(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa 176(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm1[2,0] -; SSE-NEXT: movdqa 208(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] +; SSE-NEXT: movdqa 352(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 336(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa 368(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa 128(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: movdqa 304(%rdi), %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa 320(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 224(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: movdqa 144(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 176(%rdi), %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 368(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 128(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa 304(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa 320(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,0,0,0] ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm9, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: pandn %xmm15, %xmm8 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm9, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm15 +; SSE-NEXT: por %xmm3, %xmm15 +; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm14, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm6, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn (%rsp), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm12, %xmm14 +; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm13, %xmm12 -; SSE-NEXT: por %xmm2, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm6, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pandn %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pandn %xmm14, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pandn %xmm9, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm13, %xmm12 -; SSE-NEXT: por %xmm5, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pand %xmm12, %xmm14 +; SSE-NEXT: por %xmm5, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pand %xmm12, %xmm9 +; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: pandn %xmm5, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm5, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm5, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: pandn (%rsp), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,0,2] -; SSE-NEXT: pandn %xmm13, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 16(%rsi) +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,0,2] +; SSE-NEXT: pandn %xmm12, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movdqa %xmm3, 96(%rdx) -; SSE-NEXT: movdqa %xmm8, 32(%rdx) +; SSE-NEXT: movdqa %xmm6, 32(%rdx) ; SSE-NEXT: movdqa %xmm10, 112(%rdx) -; SSE-NEXT: movdqa %xmm14, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movdqa %xmm6, 96(%rcx) -; SSE-NEXT: movdqa %xmm11, 112(%rcx) -; SSE-NEXT: movdqa %xmm0, 64(%rcx) -; SSE-NEXT: movdqa %xmm15, 80(%rcx) -; SSE-NEXT: movdqa %xmm1, 32(%rcx) -; SSE-NEXT: movdqa %xmm4, 48(%rcx) -; SSE-NEXT: movdqa %xmm9, (%rcx) -; SSE-NEXT: movdqa %xmm7, 16(%rcx) -; SSE-NEXT: addq $408, %rsp # imm = 0x198 +; SSE-NEXT: movdqa %xmm15, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movdqa %xmm13, 96(%rcx) +; SSE-NEXT: movdqa %xmm9, 112(%rcx) +; SSE-NEXT: movdqa %xmm11, 64(%rcx) +; SSE-NEXT: movdqa %xmm14, 80(%rcx) +; SSE-NEXT: movdqa %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm5, 48(%rcx) +; SSE-NEXT: movdqa %xmm8, (%rcx) +; SSE-NEXT: movdqa %xmm4, 16(%rcx) +; SSE-NEXT: addq $440, %rsp # imm = 0x1B8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $440, %rsp # imm = 0x1B8 -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2],xmm0[3,4],xmm10[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5,6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0,1],xmm0[2],xmm15[3,4],xmm0[5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0,1],xmm15[2],xmm11[3,4],xmm15[5],xmm11[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm5[1],xmm8[2,3],xmm5[4],xmm8[5,6],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm7[1],xmm5[2,3],xmm7[4],xmm5[5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0],xmm3[1],xmm12[2,3],xmm3[4],xmm12[5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0],xmm1[1],xmm9[2,3],xmm1[4],xmm9[5,6],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm10[0,1],mem[2],xmm10[3,4],mem[5],xmm10[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm15[2],mem[3,4],xmm15[5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm13[2],xmm8[3,4],xmm13[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2],mem[3,4],xmm0[5],mem[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm15[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm14[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm8[0,1],mem[2],xmm8[3,4],mem[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm15[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2],xmm0[3,4],mem[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm11[2],xmm15[3,4],xmm11[5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm13[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3,4],xmm15[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm1[0,1],mem[2],xmm1[3,4],mem[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0,1,2,3,4],xmm15[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm5[0,1],mem[2],xmm5[3,4],mem[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2],xmm13[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm12[0,1],xmm8[2],xmm12[3,4],xmm8[5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm15[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm3[2],xmm9[3,4],xmm3[5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3,4],xmm15[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm13[2],xmm15[3,4],xmm13[5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2],xmm0[3,4],mem[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1,2,3,4],xmm11[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm8[2],mem[3,4],xmm8[5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3,4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm9[2],mem[3,4],xmm9[5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2],mem[3,4],xmm0[5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1],xmm12[2],xmm8[3,4],xmm12[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm15[1],xmm13[2,3],xmm15[4],xmm13[5,6],xmm15[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm3[1],mem[2,3],xmm3[4],mem[5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufd $230, (%rsp), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm3[0],mem[1],xmm3[2,3],mem[4],xmm3[5,6],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm5[1],xmm15[2,3],xmm5[4],xmm15[5,6],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm6[1],mem[2,3],xmm6[4],mem[5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 80(%rdx) +; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3],mem[4],xmm6[5,6],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3],mem[4],xmm7[5,6],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0],xmm8[1],mem[2,3],xmm8[4],mem[5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm8, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm8, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm8, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm8, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm8, 80(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 48(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 96(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 16(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 64(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 80(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 16(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 80(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm1, 112(%rcx) -; AVX1-ONLY-NEXT: addq $440, %rsp # imm = 0x1B8 +; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i16_stride3_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $168, %rsp +; AVX2-ONLY-NEXT: subq $136, %rsp ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm0 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7],ymm0[8],ymm4[9],ymm0[10,11],ymm4[12],ymm0[13,14],ymm4[15] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm7, %ymm9, %ymm15 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm3, %ymm5, %ymm10 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm12, %ymm11, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm12, %ymm11, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm9, %ymm7, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm9, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm5, %ymm3, %ymm9 -; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 176(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm11 -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm8 -; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm12 -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1],xmm7[2],xmm14[3,4],xmm7[5],xmm14[6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm0 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm3 +; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm8 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm13 +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm10, %ymm9, %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm10 +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 176(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm11 +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm14 +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm15[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2,3],ymm0[4],ymm15[5,6],ymm0[7],ymm15[8],ymm0[9],ymm15[10,11],ymm0[12],ymm15[13,14],ymm0[15] -; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm2 -; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,6,5,4,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14],ymm1[15] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm15[2],xmm6[3,4],xmm15[5],xmm6[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm10[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3],ymm0[4],ymm10[5,6],ymm0[7],ymm10[8],ymm0[9],ymm10[10,11],ymm0[12],ymm10[13,14],ymm0[15] -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm15[2],xmm4[3,4],xmm15[5],xmm4[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm8[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7],ymm8[8],ymm1[9],ymm8[10,11],ymm1[12],ymm8[13,14],ymm1[15] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm10[2],xmm3[3,4],xmm10[5],xmm3[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm11[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6],ymm0[7],ymm11[8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14],ymm0[15] -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm12 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm11[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7],ymm11[8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14],ymm1[15] +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm1, %ymm12 ; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm4 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm14[2],xmm7[3,4],xmm14[5],xmm7[6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm2[2],xmm11[3,4],xmm2[5],xmm11[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm11[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7,8,9],ymm4[10],ymm11[11,12],ymm4[13],ymm11[14,15] -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0,1],xmm6[2],xmm15[3,4],xmm6[5],xmm15[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm11 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm11[3,4,5,6,7],ymm4[8,9,10],ymm11[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7,8,9],ymm11[10],ymm9[11,12],ymm11[13],ymm9[14,15] -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0,1],xmm3[2],xmm10[3,4],xmm3[5],xmm10[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm11 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0,1,2],ymm11[3,4,5,6,7],ymm9[8,9,10],ymm11[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,4,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm12 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm13[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm12, %ymm12 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm12 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm12, %ymm9 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm11[2],xmm2[3,4],xmm11[5],xmm2[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15] -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7],ymm1[8],ymm9[9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7],ymm8[8,9,10],ymm5[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm11[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm11[1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7],ymm8[8],ymm11[9,10],ymm8[11],ymm11[12,13],ymm8[14],ymm11[15] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0],xmm7[1],xmm14[2,3],xmm7[4],xmm14[5,6],xmm7[7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm14[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm14[1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7],ymm8[8],ymm14[9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm15[1],xmm6[2,3],xmm15[4],xmm6[5,6],xmm15[7] -; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm14[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm14[1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7],ymm8[8],ymm14[9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3],xmm10[4],xmm3[5,6],xmm10[7] -; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7],ymm1[8],ymm12[9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3],xmm15[4],xmm4[5,6],xmm15[7] +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6],xmm8[7] +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm12[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12,13],ymm8[14],ymm12[15] -; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 64(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 96(%rdx) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15] +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2,3],xmm2[4],xmm11[5,6],xmm2[7] +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 64(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm13, 96(%rdx) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX2-ONLY-NEXT: vmovdqa %ymm3, 64(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 96(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%rcx) -; AVX2-ONLY-NEXT: addq $168, %rsp +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%rcx) +; AVX2-ONLY-NEXT: addq $136, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -2162,17 +2159,17 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm6 +; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm5 ; AVX512F-NEXT: vmovdqa 272(%rdi), %xmm8 ; AVX512F-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] ; AVX512F-NEXT: vmovdqa %xmm2, %xmm14 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512F-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,4,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] +; AVX512F-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-NEXT: vmovdqa64 320(%rdi), %ymm22 ; AVX512F-NEXT: vmovdqa64 352(%rdi), %ymm23 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm6 @@ -2189,35 +2186,35 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] ; AVX512F-NEXT: vpshufb %xmm15, %xmm13, %xmm13 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm18 +; AVX512F-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm24 ; AVX512F-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm10 -; AVX512F-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6],ymm12[7],ymm10[8],ymm12[9],ymm10[10,11],ymm12[12],ymm10[13,14],ymm12[15] -; AVX512F-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm5 +; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5 +; AVX512F-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] +; AVX512F-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm11 ; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] -; AVX512F-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm10[3,4,5,6,7] -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm16 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm15 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm16, %ymm5 -; AVX512F-NEXT: vpermq {{.*#+}} ymm10 = ymm5[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6],ymm10[7],ymm5[8],ymm10[9],ymm5[10,11],ymm10[12],ymm5[13,14],ymm10[15] -; AVX512F-NEXT: vpshufb %ymm3, %ymm5, %ymm2 -; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm10 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1],xmm5[2],xmm10[3,4],xmm5[5],xmm10[6,7] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] +; AVX512F-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7] +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm17 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm10 +; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10 +; AVX512F-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15] +; AVX512F-NEXT: vpshufb %ymm3, %ymm10, %ymm2 +; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm10 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm15 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] ; AVX512F-NEXT: vpshufb %xmm9, %xmm3, %xmm3 ; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] ; AVX512F-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm19 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 ; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] @@ -2248,7 +2245,7 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15] ; AVX512F-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm17 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 ; AVX512F-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1 ; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] @@ -2260,11 +2257,11 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512F-NEXT: vpternlogq $202, %ymm16, %ymm15, %ymm3 +; AVX512F-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3 ; AVX512F-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] ; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm10[2],xmm5[3,4],xmm10[5],xmm5[6,7] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] ; AVX512F-NEXT: vpshufb %xmm2, %xmm4, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] @@ -2274,12 +2271,12 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] -; AVX512F-NEXT: vpternlogq $226, %ymm16, %ymm0, %ymm15 -; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm15[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm15[1,2],ymm3[3],ymm15[4,5],ymm3[6],ymm15[7],ymm3[8],ymm15[9,10],ymm3[11],ymm15[12,13],ymm3[14],ymm15[15] +; AVX512F-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5 +; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm5[1],xmm10[2,3],xmm5[4],xmm10[5,6],xmm5[7] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] ; AVX512F-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 @@ -2315,9 +2312,9 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm2, (%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll index 9b347e01e92d7..6670de17c6cde 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -549,15 +549,15 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: movdqa (%rdi), %xmm8 ; SSE-NEXT: movdqa 16(%rdi), %xmm10 ; SSE-NEXT: movdqa 32(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm12[0,1,0,2,4,5,6,7] @@ -578,7 +578,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[0,1,1,3,4,5,6,7] @@ -588,30 +588,30 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,1,2,0,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[2,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm12[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[0,1,2,0,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm14[0,1,2,0,4,5,6,7] @@ -627,13 +627,13 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] ; SSE-NEXT: movapd %xmm4, 16(%rsi) ; SSE-NEXT: movapd %xmm7, (%rsi) -; SSE-NEXT: movapd %xmm8, 16(%rdx) +; SSE-NEXT: movapd %xmm9, 16(%rdx) ; SSE-NEXT: movapd %xmm5, (%rdx) ; SSE-NEXT: movapd %xmm15, 16(%rcx) ; SSE-NEXT: movapd %xmm6, (%rcx) @@ -1189,15 +1189,15 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,1,0,2,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1212,17 +1212,17 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,0,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,1,0,2,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -1232,8 +1232,8 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1255,26 +1255,27 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] ; SSE-NEXT: movapd %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm14[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm14[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,3,2,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] @@ -1284,29 +1285,29 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[0,1,2,0,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] @@ -1314,26 +1315,25 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm14[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm14[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[3,1,2,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload @@ -1343,48 +1343,47 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm2[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1394,204 +1393,211 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movapd %xmm12, 32(%rdx) -; SSE-NEXT: movapd %xmm8, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm15, 32(%rcx) -; SSE-NEXT: movapd %xmm9, (%rcx) -; SSE-NEXT: movapd %xmm13, 48(%rcx) +; SSE-NEXT: movapd %xmm1, 32(%rcx) +; SSE-NEXT: movapd %xmm8, (%rcx) +; SSE-NEXT: movapd %xmm15, 48(%rcx) ; SSE-NEXT: movapd %xmm10, 16(%rcx) -; SSE-NEXT: movapd %xmm2, 32(%r8) +; SSE-NEXT: movapd %xmm3, 32(%r8) ; SSE-NEXT: movapd %xmm7, (%r8) ; SSE-NEXT: movapd %xmm14, 48(%r8) -; SSE-NEXT: movapd %xmm3, 16(%r8) +; SSE-NEXT: movapd %xmm11, 16(%r8) ; SSE-NEXT: addq $248, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride4_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $232, %rsp -; AVX1-ONLY-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm5[1,2,3],xmm11[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: subq $280, %rsp # imm = 0x118 +; AVX1-ONLY-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm6[1,2,3],xmm4[4],xmm6[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm6[1,2,3],xmm11[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3],xmm3[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm5[1,2,3],xmm15[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0],xmm5[1,2,3],xmm2[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm6[1,2,3],xmm10[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0],xmm6[1,2,3],xmm12[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm5[1,2,3],xmm13[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0],xmm5[1,2,3],xmm14[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0],xmm6[1,2,3],xmm5[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm5[1,2,3],xmm9[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm9, %xmm8 +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0],xmm6[1,2,3],xmm14[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0],xmm6[1,2,3],xmm13[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm15, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0],xmm6[1,2,3],xmm8[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1,2,3],xmm8[4],xmm5[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1,2,3],xmm8[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm15, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, (%rsp), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1599,33 +1605,33 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -1640,18 +1646,18 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: addq $232, %rsp +; AVX1-ONLY-NEXT: addq $280, %rsp # imm = 0x118 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride4_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $184, %rsp +; AVX2-SLOW-NEXT: subq $168, %rsp ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -1693,141 +1699,139 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm7 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm8 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1835,36 +1839,36 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) @@ -1874,26 +1878,25 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa %ymm14, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: addq $184, %rsp +; AVX2-SLOW-NEXT: addq $168, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride4_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $136, %rsp +; AVX2-FAST-NEXT: subq $104, %rsp ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 ; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -1903,13 +1906,14 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm10 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1920,131 +1924,132 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm15 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm1 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm2 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm10 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm9 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm14 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm13 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm9[2,3] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm13 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm13, %xmm12 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm1 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm11[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,2,3,1,3,5,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm6[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,1,3,1,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm3, (%rdx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r8) -; AVX2-FAST-NEXT: addq $136, %rsp +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-NEXT: addq $104, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride4_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $168, %rsp +; AVX2-FAST-PERLANE-NEXT: subq $184, %rsp ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -2086,185 +2091,187 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm8, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm15, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm15, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm11, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm13, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm12 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm12 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-PERLANE-NEXT: addq $168, %rsp -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq -; -; AVX512F-SLOW-LABEL: load_i16_stride4_vf32: -; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: addq $184, %rsp +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512F-SLOW-LABEL: load_i16_stride4_vf32: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vpmovqw %ymm2, %xmm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm6 @@ -2544,7 +2551,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] @@ -2630,7 +2637,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 192(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] @@ -2793,643 +2800,645 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm5[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm1[0,1,2,0,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[3,1,2,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm12[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm12[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm6[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm6[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm11[0],xmm5[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm13[0],xmm8[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm3[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm6[0],xmm13[1] +; SSE-NEXT: pshuflw $231, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm5[0],xmm12[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm5[0],xmm15[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: pshuflw $116, (%rsp), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm8[0],xmm6[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 16(%rdx) -; SSE-NEXT: movapd %xmm3, 96(%rcx) -; SSE-NEXT: movapd %xmm7, 32(%rcx) -; SSE-NEXT: movapd %xmm10, 112(%rcx) -; SSE-NEXT: movapd %xmm13, 48(%rcx) +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm3[0],xmm15[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movapd %xmm2, 96(%rcx) +; SSE-NEXT: movapd %xmm5, 32(%rcx) +; SSE-NEXT: movapd %xmm9, 112(%rcx) +; SSE-NEXT: movapd %xmm10, 48(%rcx) ; SSE-NEXT: movapd %xmm14, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rcx) -; SSE-NEXT: movapd %xmm9, 112(%r8) -; SSE-NEXT: movapd %xmm6, 96(%r8) -; SSE-NEXT: movapd %xmm0, 80(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movapd %xmm7, 112(%r8) +; SSE-NEXT: movapd %xmm4, 96(%r8) +; SSE-NEXT: movapd %xmm3, 80(%r8) ; SSE-NEXT: movapd %xmm15, 64(%r8) ; SSE-NEXT: movapd %xmm12, 48(%r8) -; SSE-NEXT: movapd %xmm1, 32(%r8) -; SSE-NEXT: movapd %xmm2, 16(%r8) -; SSE-NEXT: movapd %xmm4, (%r8) +; SSE-NEXT: movapd %xmm13, 32(%r8) +; SSE-NEXT: movapd %xmm8, 16(%r8) +; SSE-NEXT: movapd %xmm0, (%r8) ; SSE-NEXT: addq $824, %rsp # imm = 0x338 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride4_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $824, %rsp # imm = 0x338 -; AVX1-ONLY-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm7[1,2,3],xmm4[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: subq $776, %rsp # imm = 0x308 +; AVX1-ONLY-NEXT: vpxor %xmm10, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm10[1,2,3],xmm8[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1,2,3],xmm2[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm10[1,2,3],xmm4[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm10[1,2,3],xmm7[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm7[1,2,3],xmm15[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm7[1,2,3],xmm13[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm10[1,2,3],xmm13[4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm7[1,2,3],xmm14[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm10[1,2,3],xmm14[4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm7[1,2,3],xmm3[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm7[1,2,3],xmm5[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm8, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm8[0],xmm7[1,2,3],xmm8[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0],xmm10[1,2,3],xmm15[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0],xmm7[1,2,3],xmm6[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0],xmm7[1,2,3],xmm6[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm7[1,2,3],xmm12[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2,3],xmm9[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm10[1,2,3],xmm11[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm11, %xmm12, %xmm11 +; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0],xmm10[1,2,3],xmm5[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm11, %xmm10, %xmm10 ; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm14[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm15[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -3470,12 +3479,12 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $116, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshuflw $231, (%rsp), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] @@ -3508,8 +3517,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] @@ -3536,7 +3544,8 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] @@ -3546,8 +3555,8 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -3588,7 +3597,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm15, (%r8) -; AVX1-ONLY-NEXT: addq $824, %rsp # imm = 0x338 +; AVX1-ONLY-NEXT: addq $776, %rsp # imm = 0x308 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3681,7 +3690,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa 272(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm0 @@ -3695,7 +3704,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 336(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm2 @@ -3739,20 +3748,20 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3792,45 +3801,45 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 496(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 496(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,1,2,3] @@ -3839,36 +3848,36 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm15 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,1,2,3] @@ -3877,182 +3886,171 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm12 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm11 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm15 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm15[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm8 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm7 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm15[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, (%rsp), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, (%rsp), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] @@ -4062,47 +4060,58 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r8) +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 64(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm14, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm15, (%r8) ; AVX2-SLOW-NEXT: addq $696, %rsp # imm = 0x2B8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride4_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX2-FAST-NEXT: subq $680, %rsp # imm = 0x2A8 ; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 @@ -4120,11 +4129,10 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm10 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4135,235 +4143,232 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm4 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm6 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 272(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm13 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm8 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm15 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm9 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm13 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 432(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 400(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm0 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 272(%rdi), %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm15 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm14 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm2 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm10 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 432(%rdi), %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 400(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm13 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm14 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[3,1,2,3] ; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm11 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,1,2,3] ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd $231, (%rsp), %xmm7 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm7 = mem[3,1,2,3] ; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = mem[3,1,2,3] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] @@ -4372,87 +4377,90 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] +; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm12 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,1,3,1,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%r8) +; AVX2-FAST-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride4_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $696, %rsp # imm = 0x2B8 +; AVX2-FAST-PERLANE-NEXT: subq $712, %rsp # imm = 0x2C8 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -4534,266 +4542,266 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 336(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 496(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 496(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm9, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm13, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm14 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm12 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm13 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -4801,8 +4809,8 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4826,11 +4834,11 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -4844,376 +4852,369 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, (%rsp), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%r8) -; AVX2-FAST-PERLANE-NEXT: addq $696, %rsp # imm = 0x2B8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, (%r8) +; AVX2-FAST-PERLANE-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride4_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $88, %rsp -; AVX512F-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm26 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm26[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm14 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa64 112(%rdi), %xmm18 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm18[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm16 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm16[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 480(%rdi), %xmm17 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm17[0,2,2,3] +; AVX512F-SLOW-NEXT: subq $104, %rsp +; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa64 464(%rdi), %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm27 -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm27[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm28 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm21[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm19[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm20[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 112(%rdi), %xmm20 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm20[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm19 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm19[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 480(%rdi), %xmm21 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm21[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 368(%rdi), %xmm17 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm17[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 352(%rdi), %xmm24 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm24[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; AVX512F-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 336(%rdi), %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 464(%rdi), %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm16[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm26 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm29[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm25[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm24 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm26[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[3,1,2,3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm30 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm28 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm31 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm20[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm19[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm21[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm21 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm16[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm17[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm24[3,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm17 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm29[3,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm30 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm18[3,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm31 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm27[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm16[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm17[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm29 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm19[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm17 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm20[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm25[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX512F-SLOW-NEXT: vpmovqw %ymm0, %xmm0 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpmovqw %ymm1, %xmm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm13[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpmovqw %ymm2, %xmm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512F-SLOW-NEXT: vpmovqw %zmm2, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14 -; AVX512F-SLOW-NEXT: vpmovqw %ymm14, %xmm14 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm21 -; AVX512F-SLOW-NEXT: vpmovqw %zmm21, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm12[0,1,2,3],zmm13[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm0, %zmm13 -; AVX512F-SLOW-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm1, %zmm14 -; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm13[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm2, %zmm13 -; AVX512F-SLOW-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm21, %zmm14 -; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm13[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm0, %zmm14 -; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm11 = mem[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm1, %zmm14 -; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm12 = mem[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vpmovqw %ymm5, %xmm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm29 +; AVX512F-SLOW-NEXT: vpmovqw %zmm29, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vpmovqw %ymm4, %xmm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm25 +; AVX512F-SLOW-NEXT: vpmovqw %zmm25, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vpmovqw %ymm5, %xmm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm18 +; AVX512F-SLOW-NEXT: vpmovqw %zmm18, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm0, %zmm5 +; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm29, %zmm8 +; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm25, %zmm5 +; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm18, %zmm8 +; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm29, %zmm8 +; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm25, %zmm5 +; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm2, %zmm14 -; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm5[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm21, %zmm14 -; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm8 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm14 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm18, %zmm9 +; AVX512F-SLOW-NEXT: vpmovqw %zmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm8[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm11 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm29, %zmm5 +; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm5 = mem[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpmovqw %zmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm25, %zmm5 +; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm21, %zmm3 -; AVX512F-SLOW-NEXT: vpmovqw %zmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rsi) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, (%rsi) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, (%rcx) +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm18, %zmm5 +; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm4, (%rsi) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, (%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r8) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512F-SLOW-NEXT: addq $88, %rsp +; AVX512F-SLOW-NEXT: addq $104, %rsp ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 8bda8ab81eac6..30ee74b335752 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -489,35 +489,35 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i16_stride5_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm5 -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 64(%rdi), %xmm6 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,0,3] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] +; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,3] -; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm3, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,1,0,1] -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm8, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm1, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: psrlq $48, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7] @@ -527,64 +527,65 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm7, %xmm9 ; SSE-NEXT: pandn %xmm8, %xmm7 ; SSE-NEXT: por %xmm9, %xmm7 -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,1,2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,1,1,3] -; SSE-NEXT: psllq $48, %xmm5 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,2,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm2[2],xmm12[3],xmm2[3] -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; SSE-NEXT: movdqa %xmm11, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,1,1,3] +; SSE-NEXT: psllq $48, %xmm6 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,0,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: pand %xmm13, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm14[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm12, %xmm13 +; SSE-NEXT: por %xmm13, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm6[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,2,2,2,4,5,6,7] -; SSE-NEXT: pandn %xmm6, %xmm13 -; SSE-NEXT: por %xmm13, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[3,0] -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[2,2,2,2,4,5,6,7] +; SSE-NEXT: pandn %xmm8, %xmm15 +; SSE-NEXT: por %xmm15, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[3,0] +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,0] -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movdqa %xmm4, (%rsi) -; SSE-NEXT: movdqa %xmm3, (%rdx) -; SSE-NEXT: movaps %xmm12, (%rcx) -; SSE-NEXT: movaps %xmm14, (%r8) -; SSE-NEXT: movaps %xmm2, (%r9) +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm7[2,0] +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0] +; SSE-NEXT: movdqa %xmm2, (%rsi) +; SSE-NEXT: movdqa %xmm1, (%rdx) +; SSE-NEXT: movaps %xmm5, (%rcx) +; SSE-NEXT: movaps %xmm11, (%r8) +; SSE-NEXT: movaps %xmm3, (%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride5_vf8: @@ -907,72 +908,63 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i16_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: movdqa 144(%rdi), %xmm13 -; SSE-NEXT: movdqa 80(%rdi), %xmm5 -; SSE-NEXT: movdqa 96(%rdi), %xmm3 -; SSE-NEXT: movdqa 128(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm9 -; SSE-NEXT: movdqa 64(%rdi), %xmm12 -; SSE-NEXT: movdqa (%rdi), %xmm7 -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: movdqa 32(%rdi), %xmm11 -; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: movdqa 144(%rdi), %xmm14 +; SSE-NEXT: movdqa 80(%rdi), %xmm8 +; SSE-NEXT: movdqa 96(%rdi), %xmm7 +; SSE-NEXT: movdqa 128(%rdi), %xmm15 +; SSE-NEXT: movdqa 112(%rdi), %xmm12 +; SSE-NEXT: movdqa 64(%rdi), %xmm10 +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa 16(%rdi), %xmm9 +; SSE-NEXT: movdqa 32(%rdi), %xmm13 +; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] -; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] -; SSE-NEXT: movaps {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm7, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,0,1] -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm6, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,0,1] +; SSE-NEXT: movaps %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; SSE-NEXT: andps %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,0,1] -; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: andps %xmm6, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,0,1] +; SSE-NEXT: movaps %xmm6, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,3,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -980,23 +972,23 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: psllq $48, %xmm2 -; SSE-NEXT: movaps %xmm7, %xmm13 -; SSE-NEXT: andnps %xmm2, %xmm13 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm2, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: orps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,3,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -1005,43 +997,42 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] ; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535] ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; SSE-NEXT: pand %xmm3, %xmm15 ; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2,0] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[2,3] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] ; SSE-NEXT: movaps %xmm3, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] @@ -1049,76 +1040,72 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm11[3,0] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm13[3,0] ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm9[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[0,1,0,3] -; SSE-NEXT: movdqa %xmm14, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: pandn %xmm13, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm4[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,7,4,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[2,2,2,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: pandn %xmm4, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm4[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm11 -; SSE-NEXT: por %xmm14, %xmm11 -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm8[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm4[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm13 +; SSE-NEXT: por %xmm14, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm9[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm11[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[0,2] +; SSE-NEXT: movdqa %xmm5, %xmm11 ; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm9[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[3,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm12[3,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm12, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm4[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,7,4,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm6[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm5[2,0] ; SSE-NEXT: por %xmm2, %xmm8 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,0] -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] @@ -1128,15 +1115,15 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movdqa %xmm7, 16(%rdx) -; SSE-NEXT: movaps %xmm13, (%rdx) +; SSE-NEXT: movdqa %xmm6, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rdx) ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps %xmm15, (%rcx) -; SSE-NEXT: movaps %xmm11, 16(%r8) +; SSE-NEXT: movaps %xmm13, 16(%r8) ; SSE-NEXT: movaps %xmm1, (%r8) ; SSE-NEXT: movaps %xmm3, 16(%r9) ; SSE-NEXT: movaps %xmm8, (%r9) -; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride5_vf16: @@ -1759,86 +1746,89 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: subq $408, %rsp # imm = 0x198 ; SSE-NEXT: movdqa 64(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa 16(%rdi), %xmm9 -; SSE-NEXT: movdqa 32(%rdi), %xmm8 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa 16(%rdi), %xmm13 +; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa 48(%rdi), %xmm5 ; SSE-NEXT: movdqa 224(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm5 -; SSE-NEXT: movdqa 176(%rdi), %xmm10 +; SSE-NEXT: movdqa 160(%rdi), %xmm11 +; SSE-NEXT: movdqa 176(%rdi), %xmm12 ; SSE-NEXT: movdqa 208(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm11 +; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] +; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] -; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm10, %xmm3 +; SSE-NEXT: movaps {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm15, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] -; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movaps %xmm15, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,1,0,3] -; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] -; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movaps %xmm15, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: movdqa 272(%rdi), %xmm7 -; SSE-NEXT: andps %xmm10, %xmm3 +; SSE-NEXT: movdqa 272(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: andps %xmm15, %xmm3 ; SSE-NEXT: orps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 240(%rdi), %xmm3 +; SSE-NEXT: movdqa 256(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa 240(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] ; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movaps %xmm15, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: andps %xmm10, %xmm3 +; SSE-NEXT: andps %xmm15, %xmm3 ; SSE-NEXT: orps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm1 @@ -1853,30 +1843,30 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 80(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; SSE-NEXT: movdqa 144(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: andps %xmm10, %xmm2 +; SSE-NEXT: andps %xmm15, %xmm2 ; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,3,2,3] +; SSE-NEXT: psrlq $48, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] @@ -1885,25 +1875,22 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: psllq $48, %xmm2 -; SSE-NEXT: movaps %xmm10, %xmm3 +; SSE-NEXT: movaps %xmm15, %xmm3 ; SSE-NEXT: andnps %xmm2, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: orps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,3,2,3] +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,3,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -1911,24 +1898,25 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -1939,19 +1927,19 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,3,2,3] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1962,331 +1950,339 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3] +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm11 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,0,1,3] +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: andnps %xmm14, %xmm1 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0,1,3] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: andnps %xmm5, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm13[3,0] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: andnps %xmm13, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm8[3,0] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: andnps %xmm8, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm6[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,4,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[3,0] -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: pandn %xmm15, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm0[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm13 +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[3,0] +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: pandn %xmm9, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[3,0] -; SSE-NEXT: movdqa %xmm3, %xmm15 -; SSE-NEXT: pandn %xmm11, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[3,0] +; SSE-NEXT: movdqa %xmm11, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm5[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm14[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm11[3,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm14[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm8[0,2] +; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,2] +; SSE-NEXT: movaps %xmm1, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,7,4,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm11[2,0] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,0] -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] -; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm4[2,0] -; SSE-NEXT: por %xmm13, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm7, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,4,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,0] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,0] +; SSE-NEXT: por %xmm12, %xmm2 +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[2,0] +; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,0] +; SSE-NEXT: por %xmm4, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rcx) -; SSE-NEXT: movaps %xmm12, 16(%r8) -; SSE-NEXT: movaps %xmm6, 48(%r8) -; SSE-NEXT: movaps %xmm9, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%r8) -; SSE-NEXT: movaps %xmm3, 16(%r9) -; SSE-NEXT: movaps %xmm14, 48(%r9) -; SSE-NEXT: movaps %xmm0, (%r9) -; SSE-NEXT: movaps %xmm1, 32(%r9) +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm4[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps %xmm10, 16(%r8) +; SSE-NEXT: movaps %xmm9, 48(%r8) +; SSE-NEXT: movaps %xmm13, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%r8) +; SSE-NEXT: movaps %xmm11, 16(%r9) +; SSE-NEXT: movaps %xmm0, 48(%r9) +; SSE-NEXT: movaps %xmm2, (%r9) +; SSE-NEXT: movaps %xmm3, 32(%r9) ; SSE-NEXT: addq $408, %rsp # imm = 0x198 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride5_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,1,3] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2300,15 +2296,15 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3,4],xmm1[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] @@ -2317,234 +2313,237 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm14, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm6, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm10[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1,2,3],xmm12[4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm9[4,5],xmm7[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm9 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm9 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm15[2,3],xmm2[4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm9[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $48, %xmm14, %xmm9 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm12, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm9, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vpsllq $48, %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm5[4,5],xmm13[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm13[2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm11[2,3],xmm15[4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm13[2,3],xmm14[4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $48, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $48, %xmm14, %xmm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm6[4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm6[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm10[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1],xmm7[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm12[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,1,2,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,1,2,0] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm3[2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm13[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm5[4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm13[2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,1,2,0] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,1,2,0] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm0[2,3],xmm5[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm0[2,3],xmm5[4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2,3],xmm3[4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm6[4,5],xmm9[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm12[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm4[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2,3],xmm13[4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm9[3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,1,0,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2,3],xmm9[4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm11[4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm10[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpblendw $8, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3],xmm12[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -2553,13 +2552,13 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX1-ONLY-NEXT: addq $424, %rsp # imm = 0x1A8 @@ -2568,174 +2567,171 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-SLOW-LABEL: load_i16_stride5_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $280, %rsp # imm = 0x118 +; AVX2-SLOW-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm12 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6],ymm9[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6],ymm8[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2,3],xmm9[4,5],xmm10[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm3[1,2],ymm8[3],ymm3[4],ymm8[5],ymm3[6,7],ymm8[8],ymm3[9,10],ymm8[11],ymm3[12],ymm8[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6],ymm13[7] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1,2,3],xmm11[4,5],xmm13[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm11, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm12[1,2],ymm3[3],ymm12[4],ymm3[5],ymm12[6,7],ymm3[8],ymm12[9,10],ymm3[11],ymm12[12],ymm3[13],ymm12[14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm15 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6],ymm12[7] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1,2,3],xmm10[4,5],xmm12[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm10, %ymm0, %ymm12 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6],xmm14[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm13 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm13, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5],ymm8[6],ymm3[7,8],ymm8[9],ymm3[10,11],ymm8[12],ymm3[13],ymm8[14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3],xmm0[4,5,6],xmm13[7] -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm13, %ymm11, %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm3[0],xmm15[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm11 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0,1,2,3,4],ymm11[5,6,7],ymm9[8,9,10,11,12],ymm11[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm11, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7] +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0],xmm13[1],xmm14[2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0],xmm13[1],xmm7[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm15[2],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm12 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5,6,7],ymm14[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm11[1],xmm10[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5,6,7],ymm12[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm13[2],xmm14[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm13[2],xmm7[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm11[2],xmm10[3] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7],ymm9[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm11 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0,1],ymm8[2],ymm5[3],ymm8[4],ymm5[5,6],ymm8[7],ymm5[8,9],ymm8[10],ymm5[11],ymm8[12],ymm5[13,14],ymm8[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0],xmm3[1],xmm15[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm2[2],ymm4[3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8,9],ymm2[10],ymm4[11],ymm2[12],ymm4[13,14],ymm2[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0],xmm14[1],xmm13[2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6],ymm9[7] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm6 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1],ymm6[2],ymm1[3],ymm6[4],ymm1[5,6],ymm6[7],ymm1[8,9],ymm6[10],ymm1[11],ymm6[12],ymm1[13,14],ymm6[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm7[1],xmm13[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1],ymm5[2],ymm15[3],ymm5[4],ymm15[5,6],ymm5[7],ymm15[8,9],ymm5[10],ymm15[11],ymm5[12],ymm15[13,14],ymm5[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0],xmm10[1],xmm11[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2],xmm1[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm14[2],xmm13[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7,8],ymm2[9],ymm11[10],ymm2[11],ymm11[12,13],ymm2[14],ymm11[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8],ymm3[9],ymm6[10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4],ymm0[5],ymm9[6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm12[2],xmm15[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm11 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] -; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0],ymm1[1,2],ymm6[3],ymm1[4],ymm6[5],ymm1[6,7],ymm6[8],ymm1[9,10],ymm6[11],ymm1[12],ymm6[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2],xmm12[3] -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm12, %xmm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm12[2],xmm13[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0],ymm15[1,2],ymm5[3],ymm15[4],ymm5[5],ymm15[6,7],ymm5[8],ymm15[9,10],ymm5[11],ymm15[12],ymm5[13],ymm15[14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm10[2],xmm11[3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5,6,7],ymm9[8,9,10,11,12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm7[0,1],mem[2],ymm7[3],mem[4],ymm7[5,6],mem[7],ymm7[8,9],mem[10],ymm7[11],mem[12],ymm7[13,14],mem[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4],ymm2[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5],ymm8[6],ymm5[7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13],ymm8[14],ymm5[15] +; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] @@ -2744,215 +2740,217 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) -; AVX2-SLOW-NEXT: addq $280, %rsp # imm = 0x118 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r9) +; AVX2-SLOW-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride5_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX2-FAST-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm15 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm14 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm7[1,2],ymm5[3],ymm7[4],ymm5[5],ymm7[6,7],ymm5[8],ymm7[9,10],ymm5[11],ymm7[12],ymm5[13],ymm7[14,15] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13],ymm4[14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1,2,3],xmm8[4,5],xmm12[6,7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1,2,3],xmm9[4,5],xmm12[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm13, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm13, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13],ymm3[14],ymm14[15] -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm1[1,2],ymm15[3],ymm1[4],ymm15[5],ymm1[6,7],ymm15[8],ymm1[9,10],ymm15[11],ymm1[12],ymm15[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm2 ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm12, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm12, %ymm10, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,u,u,4,7,1,6> -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm10 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm15[1],ymm5[2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7,8],ymm15[9],ymm5[10],ymm15[11],ymm5[12,13],ymm15[14],ymm5[15] -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3],xmm13[4,5,6],xmm15[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm13 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm13, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm4 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm11, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm13, %ymm10, %ymm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3],xmm10[4,5,6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm11, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm14, %ymm9 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7],ymm10[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm14, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,2,3,1,3,6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm10 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [1,3,2,3,1,3,6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm8, %ymm11 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7],ymm0[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm9[3,4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm7[1],ymm3[2,3],ymm7[4],ymm3[5],ymm7[6],ymm3[7,8],ymm7[9],ymm3[10,11],ymm7[12],ymm3[13],ymm7[14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3,4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,2,u,u,5,7,2,4> -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm14, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,4,6,0,1,4,6,0] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,6,0,1,4,6,0] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm8, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2,3,4],ymm11[5,6,7],ymm0[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1],ymm0[2],ymm4[3],ymm0[4],ymm4[5,6],ymm0[7],ymm4[8,9],ymm0[10],ymm4[11],ymm0[12],ymm4[13,14],ymm0[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5],ymm12[6],ymm4[7,8],ymm12[9],ymm4[10,11],ymm12[12],ymm4[13],ymm12[14],ymm4[15] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm14, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3],ymm0[4],ymm3[5,6],ymm0[7],ymm3[8,9],ymm0[10],ymm3[11],ymm0[12],ymm3[13,14],ymm0[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3,4],xmm11[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm14, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm6[1,2],ymm8[3],ymm6[4],ymm8[5],ymm6[6,7],ymm8[8],ymm6[9,10],ymm8[11],ymm6[12],ymm8[13],ymm6[14,15] -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8],ymm3[9],ymm7[10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,3,u,u,5,0,2,7> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,u,u,5,0,2,7> +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,4,7,0,2,4,7,0] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2,3,4],ymm9[5,6,7],ymm1[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm5[1,2],ymm0[3],ymm5[4],ymm0[5],ymm5[6,7],ymm0[8],ymm5[9,10],ymm0[11],ymm5[12],ymm0[13],ymm5[14,15] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4],ymm0[5],ymm6[6,7],ymm0[8],ymm6[9,10],ymm0[11],ymm6[12],ymm0[13],ymm6[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7,8],ymm4[9],ymm12[10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm3[2],ymm7[3],ymm3[4],ymm7[5,6],ymm3[7],ymm7[8,9],ymm3[10],ymm7[11],ymm3[12],ymm7[13,14],ymm3[15] -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm15[1],mem[2,3],ymm15[4],mem[5],ymm15[6],mem[7,8],ymm15[9],mem[10,11],ymm15[12],mem[13],ymm15[14],mem[15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3],ymm5[4],ymm7[5,6],ymm5[7],ymm7[8,9],ymm5[10],ymm7[11],ymm5[12],ymm7[13,14],ymm5[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5> @@ -2964,20 +2962,20 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm1, %ymm6 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm9[0],mem[1],ymm9[2,3],mem[4],ymm9[5],mem[6],ymm9[7,8],mem[9],ymm9[10,11],mem[12],ymm9[13],mem[14],ymm9[15] +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm14[2],ymm12[3],ymm14[4],ymm12[5,6],ymm14[7],ymm12[8,9],ymm14[10],ymm12[11],ymm14[12],ymm12[13,14],ymm14[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1],ymm14[2],ymm9[3],ymm14[4],ymm9[5,6],ymm14[7],ymm9[8,9],ymm14[10],ymm9[11],ymm14[12],ymm9[13,14],ymm14[15] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -2992,233 +2990,230 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm15, 32(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r9) -; AVX2-FAST-NEXT: addq $264, %rsp # imm = 0x108 +; AVX2-FAST-NEXT: addq $296, %rsp # imm = 0x128 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride5_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm4[1,2],ymm1[3],ymm4[4],ymm1[5],ymm4[6,7],ymm1[8],ymm4[9,10],ymm1[11],ymm4[12],ymm1[13],ymm4[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5],ymm2[6],ymm7[7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13],ymm2[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1,2,3],xmm1[4,5],xmm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm12, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm6[1,2],ymm11[3],ymm6[4],ymm11[5],ymm6[6,7],ymm11[8],ymm6[9,10],ymm11[11],ymm6[12],ymm11[13],ymm6[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm12, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5],ymm10[6],ymm5[7,8],ymm10[9],ymm5[10,11],ymm10[12],ymm5[13],ymm10[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm9, %ymm8, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4,5],ymm2[6],ymm14[7,8],ymm2[9],ymm14[10],ymm2[11],ymm14[12,13],ymm2[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0],ymm10[1,2],ymm7[3],ymm10[4],ymm7[5],ymm10[6,7],ymm7[8],ymm10[9,10],ymm7[11],ymm10[12],ymm7[13],ymm10[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5],ymm8[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5],ymm6[6],ymm13[7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13],ymm6[14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1,2,3],xmm8[4,5],xmm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3],xmm9[4,5,6],xmm14[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm13, %ymm8, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5],ymm11[6],ymm15[7,8],ymm11[9],ymm15[10,11],ymm11[12],ymm15[13],ymm11[14],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm8[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm15[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm8, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5],ymm7[6],ymm10[7,8],ymm7[9],ymm10[10,11],ymm7[12],ymm10[13],ymm7[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm9[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm9, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm13[1],ymm6[2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7,8],ymm13[9],ymm6[10],ymm13[11],ymm6[12,13],ymm13[14],ymm6[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm9, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0],xmm8[1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm1[5,6,7],ymm7[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm14, %ymm15, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm9[0],xmm8[1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7],ymm0[8,9,10,11,12],ymm14[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm11, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5,6,7],ymm12[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm9[0,1],xmm8[2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm8[2],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5,6,7],ymm14[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm6[2],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7],ymm11[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4],xmm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm8[0],xmm9[1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2,3,4],ymm12[5,6,7],ymm11[8,9,10,11,12],ymm12[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5],ymm10[6],ymm6[7,8],ymm10[9],ymm6[10,11],ymm10[12],ymm6[13],ymm10[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm12[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1],ymm5[2],mem[3],ymm5[4],mem[5,6],ymm5[7],mem[8,9],ymm5[10],mem[11],ymm5[12],mem[13,14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4],xmm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2,3,4],ymm12[5,6,7],ymm11[8,9,10,11,12],ymm12[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm14[1],ymm4[2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8],ymm14[9],ymm4[10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0],ymm3[1,2],ymm7[3],ymm3[4],ymm7[5],ymm3[6,7],ymm7[8],ymm3[9,10],ymm7[11],ymm3[12],ymm7[13],ymm3[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm8[0,1],xmm9[2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2,3,4],ymm12[5,6,7],ymm11[8,9,10,11,12],ymm12[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm6[1],ymm10[2],ymm6[3],ymm10[4,5],ymm6[6],ymm10[7,8],ymm6[9],ymm10[10],ymm6[11],ymm10[12,13],ymm6[14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm12[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm14, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7],ymm5[8,9],ymm0[10],ymm5[11],ymm0[12],ymm5[13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0],xmm9[1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm6[2],ymm10[3],ymm6[4],ymm10[5,6],ymm6[7],ymm10[8,9],ymm6[10],ymm10[11],ymm6[12],ymm10[13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5],ymm10[6],ymm7[7,8],ymm10[9],ymm7[10,11],ymm10[12],ymm7[13],ymm10[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm13[2],ymm7[3],ymm13[4],ymm7[5,6],ymm13[7],ymm7[8,9],ymm13[10],ymm7[11],ymm13[12],ymm7[13,14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm15[1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm5[1,2],ymm0[3],ymm5[4],ymm0[5],ymm5[6,7],ymm0[8],ymm5[9,10],ymm0[11],ymm5[12],ymm0[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm9[2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm5[1],ymm10[2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10],ymm5[11],ymm10[12,13],ymm5[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5],ymm14[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm7[1,2],ymm13[3],ymm7[4],ymm13[5],ymm7[6,7],ymm13[8],ymm7[9,10],ymm13[11],ymm7[12],ymm13[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm6[0,1],xmm15[2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5],ymm13[6],ymm4[7,8],ymm13[9],ymm4[10,11],ymm13[12],ymm4[13],ymm13[14],ymm4[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4],ymm4[5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7],mem[8,9],ymm1[10],mem[11],ymm1[12],mem[13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%r9) ; AVX2-FAST-PERLANE-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -3657,173 +3652,170 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i16_stride5_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1000, %rsp # imm = 0x3E8 -; SSE-NEXT: movdqa 464(%rdi), %xmm4 -; SSE-NEXT: movdqa 400(%rdi), %xmm10 -; SSE-NEXT: movdqa 416(%rdi), %xmm11 -; SSE-NEXT: movdqa 448(%rdi), %xmm5 +; SSE-NEXT: subq $1016, %rsp # imm = 0x3F8 +; SSE-NEXT: movdqa 464(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm7 +; SSE-NEXT: movdqa 400(%rdi), %xmm8 +; SSE-NEXT: movdqa 416(%rdi), %xmm11 +; SSE-NEXT: movdqa 448(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm6 -; SSE-NEXT: movdqa 96(%rdi), %xmm9 -; SSE-NEXT: movdqa 128(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm8 +; SSE-NEXT: movdqa 144(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm15 +; SSE-NEXT: movdqa 96(%rdi), %xmm10 +; SSE-NEXT: movdqa 128(%rdi), %xmm14 +; SSE-NEXT: movdqa 112(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] -; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm6, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm13, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] +; SSE-NEXT: movaps %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm12, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: andps %xmm6, %xmm3 -; SSE-NEXT: orps %xmm3, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: orps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa 16(%rdi), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,1,2,3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: movdqa (%rdi), %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] ; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: andps %xmm6, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: orps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 352(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 368(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa 336(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa 336(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 320(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] ; SSE-NEXT: movdqa 384(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: andps %xmm6, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: orps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 272(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa 256(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa 256(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 240(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE-NEXT: movdqa 240(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] ; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: andps %xmm6, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: orps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 592(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 608(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa 576(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 560(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE-NEXT: movdqa 560(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] ; SSE-NEXT: movdqa 624(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps %xmm13, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: andps %xmm6, %xmm4 +; SSE-NEXT: andps %xmm13, %xmm4 ; SSE-NEXT: orps %xmm4, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm2 @@ -3832,27 +3824,27 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa 176(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: movdqa 160(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3] ; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: andps %xmm6, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: andps %xmm13, %xmm5 +; SSE-NEXT: orps %xmm5, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 528(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] @@ -3867,802 +3859,800 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: movdqa 480(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] ; SSE-NEXT: movdqa 544(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: andps %xmm6, %xmm3 -; SSE-NEXT: orps %xmm3, %xmm1 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: orps %xmm4, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: psrlq $48, %xmm10 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: psllq $48, %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm3, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: psllq $48, %xmm4 +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: andnps %xmm4, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: orps %xmm1, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: psllq $48, %xmm9 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm9, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshufd $237, (%rsp), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psllq $48, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm15 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm13[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm9[2,3] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm7[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa (%rsp), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: andnps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm15, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[3,0] +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: andnps %xmm13, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[3,0] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm9, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[3,0] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm10, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,0] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm14, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm5, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[3,0] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm12, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm11 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[3,0] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm2, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm13[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,0] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm7, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: andnps %xmm4, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: por %xmm2, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[3,0] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: andnps %xmm10, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm13[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm6[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm3[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm12[0,2] -; SSE-NEXT: movaps %xmm3, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm7[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[0,2] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] -; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm9[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm15[0,2] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm7[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,2] +; SSE-NEXT: movaps %xmm1, %xmm15 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm14[3,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm13 -; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm4[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,7,4,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,0] +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: pshufhw $232, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: andnps %xmm11, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,4,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,0] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,0] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, (%rsp), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm14[2,0] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm14[2,0] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm15[2,0] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,0] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,0] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,0] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,0] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,0] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2,0] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,0] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,0] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,0] -; SSE-NEXT: por %xmm13, %xmm15 -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm12[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 16(%rcx) -; SSE-NEXT: movaps %xmm8, 112(%r8) -; SSE-NEXT: movaps %xmm9, 96(%r8) -; SSE-NEXT: movaps %xmm11, 80(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 64(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%r8) -; SSE-NEXT: movaps %xmm15, 112(%r9) -; SSE-NEXT: movaps %xmm0, 96(%r9) -; SSE-NEXT: movaps %xmm1, 80(%r9) -; SSE-NEXT: movaps %xmm2, 64(%r9) -; SSE-NEXT: movaps %xmm3, 48(%r9) -; SSE-NEXT: movaps %xmm5, 32(%r9) -; SSE-NEXT: movaps %xmm6, 16(%r9) -; SSE-NEXT: movaps %xmm7, (%r9) -; SSE-NEXT: addq $1000, %rsp # imm = 0x3E8 +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0] +; SSE-NEXT: orps %xmm9, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps %xmm10, 112(%r8) +; SSE-NEXT: movaps %xmm14, 96(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%r8) +; SSE-NEXT: movaps %xmm6, 112(%r9) +; SSE-NEXT: movaps %xmm2, 96(%r9) +; SSE-NEXT: movaps %xmm3, 80(%r9) +; SSE-NEXT: movaps %xmm4, 64(%r9) +; SSE-NEXT: movaps %xmm5, 48(%r9) +; SSE-NEXT: movaps %xmm0, 32(%r9) +; SSE-NEXT: movaps %xmm7, 16(%r9) +; SSE-NEXT: movaps %xmm8, (%r9) +; SSE-NEXT: addq $1016, %rsp # imm = 0x3F8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride5_vf64: @@ -4679,13 +4669,12 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] @@ -4695,9 +4684,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4705,12 +4694,12 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4718,10 +4707,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] @@ -4730,7 +4718,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] @@ -4738,74 +4726,75 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %xmm0, %xmm11 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,1,1,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,1,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,2,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4],xmm4[5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4831,144 +4820,143 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm0[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm6, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm2[0,1,2,3,4],xmm15[5,6,7] ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm11[2,3],mem[4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $48, %xmm3, %xmm13 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm6, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm0[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm15 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $48, %xmm4, %xmm15 +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm9[0,1],mem[2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, (%rsp), %xmm3, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2,3],xmm3[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm8[2,3],xmm7[4,5],xmm8[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm14[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm15 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $48, %xmm9, %xmm15 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm6, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0,1],xmm9[2,3],xmm13[4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpsllq $48, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm12[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm10[2,3],xmm13[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm10[0,1,2,3],xmm12[4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm15 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $48, %xmm5, %xmm15 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm6, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm7[4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm15 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $48, %xmm7, %xmm15 +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm8[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1],xmm8[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsllq $48, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm0[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -4976,101 +4964,101 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm11[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm1[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpunpckhdq (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,0] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, (%rsp), %xmm11, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm11[0,1],mem[2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm8[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm9[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,2,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,0] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm7[0,1,2,3],mem[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm12[0,1],xmm10[2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm5[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[0,1,2,0] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm10[2],xmm15[3],xmm10[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,1,2,0] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm14[2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm9[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,1,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,2,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,2,0] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5079,104 +5067,104 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, (%rsp), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm0[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,3],xmm14[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm7[2,3],xmm9[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm12[0,1,2,3],xmm14[4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1],xmm7[2,3],mem[4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[0,1,0,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3],xmm4[4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1],xmm12[2,3],mem[4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1],xmm11[2,3],xmm10[4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm9[4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm8[4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5196,115 +5184,114 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2,3],mem[4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, (%rsp), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1,2],mem[3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm13[3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm10[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,1,1,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm5[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2,3],mem[4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm12[3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm12[0,1,2],mem[3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm11[3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm6[0,1,2,3],xmm3[4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm13[0,1,2],mem[3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -5330,413 +5317,392 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm12, (%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) ; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride5_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1064, %rsp # imm = 0x428 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX2-SLOW-NEXT: subq $1048, %rsp # imm = 0x418 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13],ymm2[14],ymm5[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4],ymm13[5],ymm9[6,7],ymm13[8],ymm9[9,10],ymm13[11],ymm9[12],ymm13[13],ymm9[14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm13[1,2],ymm10[3],ymm13[4],ymm10[5],ymm13[6,7],ymm10[8],ymm13[9,10],ymm10[11],ymm13[12],ymm10[13],ymm13[14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6],ymm3[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] ; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm15 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4,5],xmm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13],ymm11[14],ymm12[15] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4,5],xmm6[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1,2],ymm0[3],ymm4[4],ymm0[5],ymm4[6,7],ymm0[8],ymm4[9,10],ymm0[11],ymm4[12],ymm0[13],ymm4[14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm4[1,2],ymm9[3],ymm4[4],ymm9[5],ymm4[6,7],ymm9[8],ymm4[9,10],ymm9[11],ymm4[12],ymm9[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm12 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13],ymm4[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10,11],ymm7[12],ymm5[13],ymm7[14],ymm5[15] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2],mem[3],ymm1[4,5],mem[6],ymm1[7,8],mem[9],ymm1[10],mem[11],ymm1[12,13],mem[14],ymm1[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5],ymm13[6],ymm9[7,8],ymm13[9],ymm9[10,11],ymm13[12],ymm9[13],ymm13[14],ymm9[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm15[1],ymm7[2],ymm15[3],ymm7[4,5],ymm15[6],ymm7[7,8],ymm15[9],ymm7[10],ymm15[11],ymm7[12,13],ymm15[14],ymm7[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6],xmm9[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5],ymm8[6],ymm5[7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13],ymm8[14],ymm5[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3],xmm9[4,5,6],xmm11[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm4[0],mem[1],ymm4[2],mem[3],ymm4[4,5],mem[6],ymm4[7,8],mem[9],ymm4[10],mem[11],ymm4[12,13],mem[14],ymm4[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3],xmm9[4,5,6],xmm11[7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, %xmm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm14[1],ymm8[2],ymm14[3],ymm8[4,5],ymm14[6],ymm8[7,8],ymm14[9],ymm8[10],ymm14[11],ymm8[12,13],ymm14[14],ymm8[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5,6],xmm8[7] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5],ymm10[6],ymm13[7,8],ymm10[9],ymm13[10,11],ymm10[12],ymm13[13],ymm10[14],ymm13[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm15[1],ymm6[2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10],ymm15[11],ymm6[12,13],ymm15[14],ymm6[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3],xmm8[4,5,6],xmm10[7] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm8, %ymm1, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10,11],ymm9[12],ymm12[13],ymm9[14],ymm12[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3],xmm8[4,5,6],xmm10[7] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm3[0],xmm9[1],xmm3[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1,2,3,4],ymm6[5,6,7],ymm2[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2,3,4],ymm8[5,6,7],ymm4[8,9,10,11,12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 624(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm2[0],xmm5[1],xmm2[2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm5[1],xmm12[2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm15 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm10 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm11 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1,2,3,4],ymm10[5,6,7],ymm4[8,9,10,11,12],ymm10[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm2[0],xmm5[1],xmm2[2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm5[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm10 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm8 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm13 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0,1,2,3,4],ymm13[5,6,7],ymm2[8,9,10,11,12],ymm13[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0,1,2,3,4],ymm13[5,6,7],ymm4[8,9,10,11,12],ymm13[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm2[0],xmm5[1],xmm2[2,3] +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm4[0],xmm5[1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7],ymm8[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm15, %xmm13 -; AVX2-SLOW-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm4[2],xmm15[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm9[2],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm12[2],xmm10[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm10, %xmm12 -; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm15[2],xmm12[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7],ymm11[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1],xmm9[2],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm10 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm5[2],xmm2[3] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm5[2],xmm4[3] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5],ymm5[6],ymm15[7,8],ymm5[9],ymm15[10,11],ymm5[12],ymm15[13],ymm5[14],ymm15[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $82, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm13[0],mem[1],ymm13[2,3],mem[4],ymm13[5],mem[6],ymm13[7,8],mem[9],ymm13[10,11],mem[12],ymm13[13],mem[14],ymm13[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1],ymm6[2],ymm11[3],ymm6[4],ymm11[5,6],ymm6[7],ymm11[8,9],ymm6[10],ymm11[11],ymm6[12],ymm11[13,14],ymm6[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm14[0],xmm13[1],xmm14[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm2[0],xmm12[1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm14[0],mem[1],ymm14[2,3],mem[4],ymm14[5],mem[6],ymm14[7,8],mem[9],ymm14[10,11],mem[12],ymm14[13],mem[14],ymm14[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = mem[0],xmm4[1],mem[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0],xmm14[1],xmm9[2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[0],xmm12[1],mem[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm8[0,1],mem[2],ymm8[3],mem[4],ymm8[5,6],mem[7],ymm8[8,9],mem[10],ymm8[11],mem[12],ymm8[13,14],mem[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3,4],xmm4[5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[0],xmm3[1],mem[2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[0],xmm10[1],mem[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm12[1],mem[2,3],ymm12[4],mem[5],ymm12[6],mem[7,8],ymm12[9],mem[10,11],ymm12[12],mem[13],ymm12[14],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3,4],xmm3[5,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[0],xmm2[1],mem[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm15[1],ymm5[2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7,8],ymm15[9],ymm5[10],ymm15[11],ymm5[12,13],ymm15[14],ymm5[15] +; AVX2-SLOW-NEXT: vpblendw $181, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm13[1],mem[2],ymm13[3],mem[4,5],ymm13[6],mem[7,8],ymm13[9],mem[10],ymm13[11],mem[12,13],ymm13[14],mem[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm15[1,2],mem[3],ymm15[4],mem[5],ymm15[6,7],mem[8],ymm15[9,10],mem[11],ymm15[12],mem[13],ymm15[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm6[2],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0],ymm13[1,2],ymm9[3],ymm13[4],ymm9[5],ymm13[6,7],ymm9[8],ymm13[9,10],ymm9[11],ymm13[12],ymm9[13],ymm13[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1],xmm15[2],xmm2[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm14[1],ymm10[2],ymm14[3],ymm10[4,5],ymm14[6],ymm10[7,8],ymm14[9],ymm10[10],ymm14[11],ymm10[12,13],ymm14[14],ymm10[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] -; AVX2-SLOW-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm12[1,2],mem[3],ymm12[4],mem[5],ymm12[6,7],mem[8],ymm12[9,10],mem[11],ymm12[12],mem[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = xmm13[0,1],mem[2],xmm13[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm12[0],mem[1],ymm12[2],mem[3],ymm12[4,5],mem[6],ymm12[7,8],mem[9],ymm12[10],mem[11],ymm12[12,13],mem[14],ymm12[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm8[1,2],ymm14[3],ymm8[4],ymm14[5],ymm8[6,7],ymm14[8],ymm8[9,10],ymm14[11],ymm8[12],ymm14[13],ymm8[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[0,1],xmm15[2],mem[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm13[0],mem[1],ymm13[2],mem[3],ymm13[4,5],mem[6],ymm13[7,8],mem[9],ymm13[10],mem[11],ymm13[12,13],mem[14],ymm13[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0],ymm11[1,2],mem[3],ymm11[4],mem[5],ymm11[6,7],mem[8],ymm11[9,10],mem[11],ymm11[12],mem[13],ymm11[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm7[2],xmm11[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[0,1],xmm11[2],mem[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1],ymm4[2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8],ymm14[9],ymm4[10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm14[2],xmm8[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10],ymm9[11],ymm12[12,13],ymm9[14],ymm12[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4],ymm5[5],ymm10[6,7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12],ymm5[13],ymm10[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2],xmm3[3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm12[2],xmm10[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendw $107, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7],mem[8,9],ymm0[10],mem[11],ymm0[12],mem[13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5],ymm9[6],mem[7,8],ymm9[9],mem[10,11],ymm9[12],mem[13],ymm9[14],mem[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] @@ -5744,475 +5710,496 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm10[0,1],mem[2],ymm10[3],mem[4],ymm10[5,6],mem[7],ymm10[8,9],mem[10],ymm10[11],mem[12],ymm10[13,14],mem[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5],ymm4[6],mem[7,8],ymm4[9],mem[10,11],ymm4[12],mem[13],ymm4[14],mem[15] +; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0],ymm14[1],mem[2,3],ymm14[4],mem[5],ymm14[6],mem[7,8],ymm14[9],mem[10,11],ymm14[12],mem[13],ymm14[14],mem[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 64(%r9) +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm9[2],mem[3],ymm9[4],mem[5,6],ymm9[7],mem[8,9],ymm9[10],mem[11],ymm9[12],mem[13,14],ymm9[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5],mem[6],ymm6[7,8],mem[9],ymm6[10,11],mem[12],ymm6[13],mem[14],ymm6[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 64(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: addq $1064, %rsp # imm = 0x428 +; AVX2-SLOW-NEXT: addq $1048, %rsp # imm = 0x418 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride5_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm9 +; AVX2-FAST-NEXT: subq $1000, %rsp # imm = 0x3E8 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15] -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1,2],ymm14[3],ymm4[4],ymm14[5],ymm4[6,7],ymm14[8],ymm4[9,10],ymm14[11],ymm4[12],ymm14[13],ymm4[14,15] -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,0,2,4,6,1,3] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4],ymm4[5],ymm3[6,7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12],ymm4[13],ymm3[14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,0,2,4,6,1,3] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15] +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm4[1],ymm9[2,3],ymm4[4],ymm9[5],ymm4[6],ymm9[7,8],ymm4[9],ymm9[10,11],ymm4[12],ymm9[13],ymm4[14],ymm9[15] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm15 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4],ymm8[5],ymm12[6,7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12],ymm8[13],ymm12[14,15] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5],ymm4[6],ymm7[7,8],ymm4[9],ymm7[10,11],ymm4[12],ymm7[13],ymm4[14],ymm7[15] ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2,3],ymm0[4],ymm6[5],ymm0[6],ymm6[7,8],ymm0[9],ymm6[10,11],ymm0[12],ymm6[13],ymm0[14],ymm6[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4],ymm4[5],ymm3[6,7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12],ymm4[13],ymm3[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10,11],ymm5[12],ymm0[13],ymm5[14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4],ymm1[5],ymm6[6,7],ymm1[8],ymm6[9,10],ymm1[11],ymm6[12],ymm1[13],ymm6[14,15] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm10 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm15[1],ymm12[2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10],ymm15[11],ymm12[12,13],ymm15[14],ymm12[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6],mem[7,8],ymm0[9],mem[10],ymm0[11],mem[12,13],ymm0[14],mem[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm14[1],mem[2,3],ymm14[4],mem[5],ymm14[6],mem[7,8],ymm14[9],mem[10,11],ymm14[12],mem[13],ymm14[14],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,u,u,u,4,7,1,6> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2],ymm10[3],ymm13[4,5],ymm10[6],ymm13[7,8],ymm10[9],ymm13[10],ymm10[11],ymm13[12,13],ymm10[14],ymm13[15] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3],xmm1[4,5,6],xmm10[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm3, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm12 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6],xmm11[7] -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5],ymm4[6],mem[7,8],ymm4[9],mem[10,11],ymm4[12],mem[13],ymm4[14],mem[15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm5[0],mem[1],ymm5[2],mem[3],ymm5[4,5],mem[6],ymm5[7,8],mem[9],ymm5[10],mem[11],ymm5[12,13],mem[14],ymm5[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3],xmm11[4,5,6],xmm8[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm4 -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm9[0],mem[1],ymm9[2,3],mem[4],ymm9[5],mem[6],ymm9[7,8],mem[9],ymm9[10,11],mem[12],ymm9[13],mem[14],ymm9[15] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,3,1,3,0,3,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,u,u,4,7,1,6> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm6 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6],xmm3[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5],ymm14[6],ymm11[7,8],ymm14[9],ymm11[10,11],ymm14[12],ymm11[13],ymm14[14],ymm11[15] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm15[0],mem[1],ymm15[2],mem[3],ymm15[4,5],mem[6],ymm15[7,8],mem[9],ymm15[10],mem[11],ymm15[12,13],mem[14],ymm15[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5,6],xmm11[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm8[1],ymm12[2,3],ymm8[4],ymm12[5],ymm8[6],ymm12[7,8],ymm8[9],ymm12[10,11],ymm8[12],ymm12[13],ymm8[14],ymm12[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6],xmm12[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10,11],ymm5[12],ymm10[13],ymm5[14],ymm10[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,3,1,3,0,3,5,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,2,3,1,3,6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0,1,2,3,4],ymm11[5,6,7],ymm6[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7],ymm0[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,2,3,1,3,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm7 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3],ymm4[4],ymm9[5,6],ymm4[7],ymm9[8,9],ymm4[10],ymm9[11],ymm4[12],ymm9[13,14],ymm4[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm6 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,u,u,5,7,2,4> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,4,6,0,1,4,6,0] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm6, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm6, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm1[5,6,7],ymm10[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7],ymm11[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $107, (%rsp), %ymm3, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm12[0],mem[1],ymm12[2,3],mem[4],ymm12[5],mem[6],ymm12[7,8],mem[9],ymm12[10,11],mem[12],ymm12[13],mem[14],ymm12[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,2,u,u,5,7,2,4> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,4,6,0,1,4,6,0] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7],mem[8,9],ymm13[10],mem[11],ymm13[12],mem[13,14],ymm13[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm12[2],mem[3],ymm12[4],mem[5,6],ymm12[7],mem[8,9],ymm12[10],mem[11],ymm12[12],mem[13,14],ymm12[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5],ymm8[6],ymm2[7,8],ymm8[9],ymm2[10,11],ymm8[12],ymm2[13],ymm8[14],ymm2[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm13[0],mem[1],ymm13[2,3],mem[4],ymm13[5],mem[6],ymm13[7,8],mem[9],ymm13[10,11],mem[12],ymm13[13],mem[14],ymm13[15] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7],mem[8,9],ymm13[10],mem[11],ymm13[12],mem[13,14],ymm13[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0,1],ymm15[2],ymm14[3],ymm15[4],ymm14[5,6],ymm15[7],ymm14[8,9],ymm15[10],ymm14[11],ymm15[12],ymm14[13,14],ymm15[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm9[1,2],ymm4[3],ymm9[4],ymm4[5],ymm9[6,7],ymm4[8],ymm9[9,10],ymm4[11],ymm9[12],ymm4[13],ymm9[14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm11[0],mem[1],ymm11[2],mem[3],ymm11[4,5],mem[6],ymm11[7,8],mem[9],ymm11[10],mem[11],ymm11[12,13],mem[14],ymm11[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,u,u,5,0,2,7> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,7,0,2,4,7,0] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10,11],ymm15[12],ymm14[13],ymm15[14],ymm14[15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm4, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm10[1],ymm15[2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8],ymm10[9],ymm15[10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm12[0],mem[1,2],ymm12[3],mem[4],ymm12[5],mem[6,7],ymm12[8],mem[9,10],ymm12[11],mem[12],ymm12[13],mem[14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8],ymm2[9],ymm8[10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15] +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm12 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1],ymm8[2],ymm4[3],ymm8[4],ymm4[5,6],ymm8[7],ymm4[8,9],ymm8[10],ymm4[11],ymm8[12],ymm4[13,14],ymm8[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm7 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm11[0],mem[1],ymm11[2,3],mem[4],ymm11[5],mem[6],ymm11[7,8],mem[9],ymm11[10,11],mem[12],ymm11[13],mem[14],ymm11[15] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm6, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw $214, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm3[0],mem[1,2],ymm3[3],mem[4],ymm3[5],mem[6,7],ymm3[8],mem[9,10],ymm3[11],mem[12],ymm3[13],mem[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4,5],ymm12[6],ymm5[7,8],ymm12[9],ymm5[10],ymm12[11],ymm5[12,13],ymm12[14],ymm5[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,3,u,u,5,0,2,7> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,4,7,0,2,4,7,0] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm6[0,1,2,3,4],ymm10[5,6,7],ymm6[8,9,10,11,12],ymm10[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0],ymm3[1,2],mem[3],ymm3[4],mem[5],ymm3[6,7],mem[8],ymm3[9,10],mem[11],ymm3[12],mem[13],ymm3[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0],xmm6[1],xmm10[2],xmm6[3] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0],ymm13[1],mem[2],ymm13[3],mem[4,5],ymm13[6],mem[7,8],ymm13[9],mem[10],ymm13[11],mem[12,13],ymm13[14],mem[15] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm6[0,1,2,3,4],ymm10[5,6,7],ymm6[8,9,10,11,12],ymm10[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0],ymm6[1,2],mem[3],ymm6[4],mem[5],ymm6[6,7],mem[8],ymm6[9,10],mem[11],ymm6[12],mem[13],ymm6[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0],xmm6[1],xmm10[2],xmm6[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10],ymm14[11],ymm15[12,13],ymm14[14],ymm15[15] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm6[0,1,2,3,4],ymm10[5,6,7],ymm6[8,9,10,11,12],ymm10[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm4[1,2],ymm8[3],ymm4[4],ymm8[5],ymm4[6,7],ymm8[8],ymm4[9,10],ymm8[11],ymm4[12],ymm8[13],ymm4[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0],xmm6[1],xmm10[2],xmm6[3] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm10[1,2],ymm13[3],ymm10[4],ymm13[5],ymm10[6,7],ymm13[8],ymm10[9,10],ymm13[11],ymm10[12],ymm13[13],ymm10[14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $173, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,3,u,u,6,0,3,5> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,1,3,0,2,5,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <1,3,u,u,6,0,3,5> +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,1,3,0,2,5,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm7, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3,4],xmm11[5,6,7] +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm5, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3,4],xmm12[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3,4],xmm11[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rsi) @@ -6234,36 +6221,35 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm3, (%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r9) -; AVX2-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX2-FAST-NEXT: vmovdqa %ymm8, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r9) +; AVX2-FAST-NEXT: addq $1000, %rsp # imm = 0x3E8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride5_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: subq $1080, %rsp # imm = 0x438 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm0 @@ -6272,420 +6258,428 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm8[1,2],ymm15[3],ymm8[4],ymm15[5],ymm8[6,7],ymm15[8],ymm8[9,10],ymm15[11],ymm8[12],ymm15[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5],ymm9[6],ymm5[7,8],ymm9[9],ymm5[10,11],ymm9[12],ymm5[13],ymm9[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm4[1,2],ymm12[3],ymm4[4],ymm12[5],ymm4[6,7],ymm12[8],ymm4[9,10],ymm12[11],ymm4[12],ymm12[13],ymm4[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5],ymm11[6],ymm13[7,8],ymm11[9],ymm13[10,11],ymm11[12],ymm13[13],ymm11[14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4,5],xmm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm14[1,2],ymm13[3],ymm14[4],ymm13[5],ymm14[6,7],ymm13[8],ymm14[9,10],ymm13[11],ymm14[12],ymm13[13],ymm14[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2],ymm10[3],ymm0[4],ymm10[5],ymm0[6,7],ymm10[8],ymm0[9,10],ymm10[11],ymm0[12],ymm10[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13],ymm2[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5],ymm8[6],ymm6[7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13],ymm8[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5],ymm3[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4,5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4,5],ymm1[6],mem[7,8],ymm1[9],mem[10],ymm1[11],mem[12,13],ymm1[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm15[1],mem[2,3],ymm15[4],mem[5],ymm15[6],mem[7,8],ymm15[9],mem[10,11],ymm15[12],mem[13],ymm15[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm9[0],mem[1],ymm9[2],mem[3],ymm9[4,5],mem[6],ymm9[7,8],mem[9],ymm9[10],mem[11],ymm9[12,13],mem[14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10,11],ymm12[12],ymm14[13],ymm12[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm13[1],ymm11[2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10],ymm13[11],ymm11[12,13],ymm13[14],ymm11[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2,3],xmm9[4,5,6],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm9, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0],ymm10[1],mem[2,3],ymm10[4],mem[5],ymm10[6],mem[7,8],ymm10[9],mem[10,11],ymm10[12],mem[13],ymm10[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2,3],xmm9[4,5,6],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5,6],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm8, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm6, %ymm7, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1,2,3,4],ymm6[5,6,7],ymm2[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 624(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0],xmm12[1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1,2,3,4],ymm6[5,6,7],ymm2[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0],xmm13[1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1,2,3,4],ymm6[5,6,7],ymm2[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm2[0],xmm6[1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 624(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm7[0],xmm4[1],xmm7[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm15[2],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm12[2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm1[2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm13[2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm6[2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm6[2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13],ymm4[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm11[2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $148, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm10[0],xmm14[1],xmm10[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0],xmm4[1],xmm14[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm6[1],mem[2,3],ymm6[4],mem[5],ymm6[6],mem[7,8],ymm6[9],mem[10,11],ymm6[12],mem[13],ymm6[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm8[0,1],mem[2],ymm8[3],mem[4],ymm8[5,6],mem[7],ymm8[8,9],mem[10],ymm8[11],mem[12],ymm8[13,14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0],xmm9[1],xmm12[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13],ymm6[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm7[0,1],ymm15[2],ymm7[3],ymm15[4],ymm7[5,6],ymm15[7],ymm7[8,9],ymm15[10],ymm7[11],ymm15[12],ymm7[13,14],ymm15[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm10[1],xmm13[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm14[0],mem[1],ymm14[2,3],mem[4],ymm14[5],mem[6],ymm14[7,8],mem[9],ymm14[10,11],mem[12],ymm14[13],mem[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm12[2],ymm9[3],ymm12[4],ymm9[5,6],ymm12[7],ymm9[8,9],ymm12[10],ymm9[11],ymm12[12],ymm9[13,14],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm13[0],xmm5[1],xmm13[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm15[0],mem[1],xmm15[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm11[2],ymm10[3],ymm11[4],ymm10[5,6],ymm11[7],ymm10[8,9],ymm11[10],ymm10[11],ymm11[12],ymm10[13,14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm3[0],xmm13[1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5],ymm2[6],mem[7,8],ymm2[9],mem[10,11],ymm2[12],mem[13],ymm2[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4],xmm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0],xmm2[1],mem[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2],mem[3],ymm8[4,5],mem[6],ymm8[7,8],mem[9],ymm8[10],mem[11],ymm8[12,13],mem[14],ymm8[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $41, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1,2],mem[3],ymm1[4],mem[5],ymm1[6,7],mem[8],ymm1[9,10],mem[11],ymm1[12],mem[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm1[0,1],mem[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0],ymm7[1,2],mem[3],ymm7[4],mem[5],ymm7[6,7],mem[8],ymm7[9,10],mem[11],ymm7[12],mem[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm6[0],mem[1],ymm6[2],mem[3],ymm6[4,5],mem[6],ymm6[7,8],mem[9],ymm6[10],mem[11],ymm6[12,13],mem[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0],ymm8[1,2],mem[3],ymm8[4],mem[5],ymm8[6,7],mem[8],ymm8[9,10],mem[11],ymm8[12],mem[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0],xmm5[1],xmm11[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[0,1],mem[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm14[1],mem[2],ymm14[3],mem[4,5],ymm14[6],mem[7,8],ymm14[9],mem[10],ymm14[11],mem[12,13],ymm14[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0],xmm5[1],xmm11[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm12[0,1],mem[2],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm10[1],ymm8[2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7,8],ymm10[9],ymm8[10],ymm10[11],ymm8[12,13],ymm10[14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd $4, (%rsp), %xmm4, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm4[0,1],mem[2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm10[1,2],ymm11[3],ymm10[4],ymm11[5],ymm10[6,7],ymm11[8],ymm10[9,10],ymm11[11],ymm10[12],ymm11[13],ymm10[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm13[2],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7,8],ymm7[9],ymm12[10],ymm7[11],ymm12[12,13],ymm7[14],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm13[1,2],ymm11[3],ymm13[4],ymm11[5],ymm13[6,7],ymm11[8],ymm13[9,10],ymm11[11],ymm13[12],ymm11[13],ymm13[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm7[2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm5[2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7],ymm0[8,9],mem[10],ymm0[11],mem[12],ymm0[13,14],mem[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2],xmm0[3,4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5],ymm5[6],mem[7,8],ymm5[9],mem[10,11],ymm5[12],mem[13],ymm5[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3,4],xmm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm6, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1],ymm10[2],ymm8[3],ymm10[4],ymm8[5,6],ymm10[7],ymm8[8,9],ymm10[10],ymm8[11],ymm10[12],ymm8[13,14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4],ymm6[5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm6[2],mem[3],ymm6[4],mem[5,6],ymm6[7],mem[8,9],ymm6[10],mem[11],ymm6[12],mem[13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4],ymm6[5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5],mem[6],ymm6[7,8],mem[9],ymm6[10,11],mem[12],ymm6[13],mem[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3,4],xmm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3,4],xmm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm7[2],ymm12[3],ymm7[4],ymm12[5,6],ymm7[7],ymm12[8,9],ymm7[10],ymm12[11],ymm7[12],ymm12[13,14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3,4],xmm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm15, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] @@ -6715,28 +6709,29 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: addq $1080, %rsp # imm = 0x438 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride5_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX512F-SLOW-NEXT: subq $584, %rsp # imm = 0x248 ; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] ; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm6 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm7 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] @@ -6745,92 +6740,91 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %ymm4 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm8 ; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm9 ; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vporq %ymm2, %ymm3, %ymm23 +; AVX512F-SLOW-NEXT: vporq %ymm2, %ymm3, %ymm19 ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa64 224(%rdi), %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa64 176(%rdi), %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm13 ; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm10 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm10[1,2],ymm3[3],ymm10[4],ymm3[5],ymm10[6,7],ymm3[8],ymm10[9,10],ymm3[11],ymm10[12],ymm3[13],ymm10[14,15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm10, %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm11 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128] ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10],ymm7[11],ymm4[12,13],ymm7[14],ymm4[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm29 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3],ymm12[4],ymm0[5,6],ymm12[7],ymm0[8,9],ymm12[10],ymm0[11],ymm12[12],ymm0[13,14],ymm12[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm22 +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm21 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0],xmm14[1],xmm13[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm24 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm7[2],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm3[2],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10],ymm11[11],ymm10[12,13],ymm11[14],ymm10[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm12[1],ymm3[2,3],ymm12[4],ymm3[5],ymm12[6],ymm3[7,8],ymm12[9],ymm3[10,11],ymm12[12],ymm3[13],ymm12[14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm5 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] ; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 @@ -6841,782 +6835,792 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm13 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm8 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm7 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm10 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm18[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0],xmm11[1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm24 -; AVX512F-SLOW-NEXT: vmovdqa 576(%rdi), %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa 608(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm13[2],ymm1[3],ymm13[4],ymm1[5,6],ymm13[7],ymm1[8,9],ymm13[10],ymm1[11],ymm13[12],ymm1[13,14],ymm13[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm13[1],xmm7[2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 544(%rdi), %ymm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13],ymm2[14],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm6 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm16 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm16[3,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 512(%rdi), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa 544(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm11 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm27 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm27[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 480(%rdi), %xmm18 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm18[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 480(%rdi), %xmm28 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm28[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm25, %zmm24 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm24, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm25, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %ymm15, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa %ymm12, %ymm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm15[2],ymm12[3],ymm15[4],ymm12[5,6],ymm15[7],ymm12[8,9],ymm15[10],ymm12[11],ymm15[12],ymm12[13,14],ymm15[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm15[3,4],xmm0[5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7],ymm4[8,9],ymm5[10],ymm4[11],ymm5[12],ymm4[13,14],ymm5[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3,4],xmm0[5,6,7] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm12[1],ymm3[2,3],ymm12[4],ymm3[5],ymm12[6],ymm3[7,8],ymm12[9],ymm3[10,11],ymm12[12],ymm3[13],ymm12[14],ymm3[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm15 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm2[1,2],ymm15[3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5],ymm12[6],ymm15[7,8],ymm12[9],ymm15[10,11],ymm12[12],ymm15[13],ymm12[14],ymm15[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1,2],ymm7[3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm19[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm24 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm21[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm24[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm26[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0],xmm10[1],xmm7[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm20 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %xmm11, %xmm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm11[2],xmm8[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm13[2],xmm10[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm30 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm31 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm5, %ymm9 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm11[1],ymm6[2,3],ymm11[4],ymm6[5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10,11],ymm11[12],ymm6[13],ymm11[14],ymm6[15] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2],ymm14[3],ymm11[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm18[0,3,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm16, %xmm15 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm11, %ymm11 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm11[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa %ymm13, %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm6 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm13[0],ymm6[1,2],ymm13[3],ymm6[4],ymm13[5],ymm6[6,7],ymm13[8],ymm6[9,10],ymm13[11],ymm6[12],ymm13[13],ymm6[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2],xmm14[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm19 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2],ymm8[3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm28[0,3,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm27, %xmm13 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm3[1,2],ymm6[3],ymm3[4],ymm6[5],ymm3[6,7],ymm6[8],ymm3[9,10],ymm6[11],ymm3[12],ymm6[13],ymm3[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0],xmm8[1],xmm14[2],xmm8[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7] ; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm3[1],ymm12[2,3],ymm3[4],ymm12[5],ymm3[6],ymm12[7,8],ymm3[9],ymm12[10,11],ymm3[12],ymm12[13],ymm3[14],ymm12[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm26 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2],ymm11[3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm21[0,3,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm3 -; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm19, %xmm13 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm4[1,2],ymm1[3],ymm4[4],ymm1[5],ymm4[6,7],ymm1[8],ymm4[9,10],ymm1[11],ymm4[12],ymm1[13],ymm4[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm27 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2],xmm11[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm10[2],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm19 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm15, %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa %ymm12, %ymm9 +; AVX512F-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm26[0,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm20 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm24, %xmm8 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4],ymm5[5],ymm4[6,7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12],ymm5[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm29, %zmm25, %zmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm7[2],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13],ymm4[14],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm5[1],ymm11[2,3],ymm5[4],ymm11[5],ymm5[6],ymm11[7,8],ymm5[9],ymm11[10,11],ymm5[12],ymm11[13],ymm5[14],ymm11[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0,1],ymm14[2],ymm10[3],ymm14[4],ymm10[5,6],ymm14[7],ymm10[8,9],ymm14[10],ymm10[11],ymm14[12],ymm10[13,14],ymm14[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm11 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0],xmm8[1],xmm7[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3,4],xmm2[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0],xmm8[1],xmm12[2,3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm31 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm18[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm16[2],xmm12[3],xmm16[3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm29 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3,4],xmm13[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm12, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm13, %xmm13 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm28[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm27[2],xmm15[3],xmm27[3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm12 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm12 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm24[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm23 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm31 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm20[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm10 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm24[2],xmm12[3],xmm24[3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $148, (%rsp), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm11 = ymm3[0,1],mem[2],ymm3[3],mem[4],ymm3[5,6],mem[7],ymm3[8,9],mem[10],ymm3[11],mem[12],ymm3[13,14],mem[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3,4],xmm11[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm11 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm13 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm10[1,2],ymm14[3],ymm10[4],ymm14[5],ymm10[6,7],ymm14[8],ymm10[9,10],ymm14[11],ymm10[12],ymm14[13],ymm10[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2],xmm3[3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa %ymm8, %ymm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm11[2],ymm3[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0],xmm8[1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm3[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm0[2],xmm7[3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5],ymm7[6],ymm0[7,8],ymm7[9],ymm0[10,11],ymm7[12],ymm0[13],ymm7[14],ymm0[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1,2,3],xmm2[4,5],xmm9[6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm9 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1,2,3,4,5,6,7],ymm9[8],ymm2[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4],ymm3[5,6],ymm9[7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5],ymm14[6],ymm10[7,8],ymm14[9],ymm10[10,11],ymm14[12],ymm10[13],ymm14[14],ymm10[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4],xmm10[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4],ymm5[5],ymm4[6,7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12],ymm5[13],ymm4[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm5 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512F-SLOW-NEXT: vpblendw $173, (%rsp), %ymm3, %ymm12 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm12 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6],ymm12[7] +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm12 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm12 = ymm3[0,1],mem[2],ymm3[3],mem[4],ymm3[5,6],mem[7],ymm3[8,9],mem[10],ymm3[11],mem[12],ymm3[13,14],mem[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4],xmm12[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm6[1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13],ymm0[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm24 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3,4],xmm8[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa %ymm5, %ymm12 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm11[1],ymm5[2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7,8],ymm11[9],ymm5[10],ymm11[11],ymm5[12,13],ymm11[14],ymm5[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm15 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0],xmm2[1],xmm7[2],xmm2[3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm1[2],ymm4[3],ymm1[4],ymm4[5,6],ymm1[7],ymm4[8,9],ymm1[10],ymm4[11],ymm1[12],ymm4[13,14],ymm1[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0],xmm1[1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm5[2],xmm7[3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm8 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1,2,3],xmm0[4,5],xmm7[6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm7 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm21 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0,1,2],xmm2[3,4],xmm7[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm11 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 ; AVX512F-SLOW-NEXT: movb $7, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10],ymm0[11],ymm7[12,13],ymm0[14],ymm7[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3],xmm3[4,5,6],xmm5[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4,5,6,7],ymm7[8],ymm3[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4],ymm3[5],ymm0[6,7],ymm3[8],ymm0[9,10],ymm3[11],ymm0[12],ymm3[13],ymm0[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm0[2],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4,5,6,7],ymm6[8],ymm4[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm6 +; AVX512F-SLOW-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = mem[0],ymm6[1,2],mem[3],ymm6[4],mem[5],ymm6[6,7],mem[8],ymm6[9,10],mem[11],ymm6[12],mem[13],ymm6[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2],xmm10[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7],mem[8,9],ymm0[10],mem[11],ymm0[12],mem[13,14],ymm0[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4],ymm4[5,6],ymm6[7] -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $82, (%rsp), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm6 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8],ymm4[9],ymm0[10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6],xmm6[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm0, (%rsi) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm0, 64(%rdx) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm0, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, 64(%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, (%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 64(%r8) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm0, (%r8) +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpblendw $148, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7],ymm4[8,9],mem[10],ymm4[11],mem[12],ymm4[13,14],mem[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6],xmm4[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rdx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%r8) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, (%r9) -; AVX512F-SLOW-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512F-SLOW-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride5_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512F-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vmovdqa 496(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX512F-FAST-NEXT: vmovdqa 512(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vmovdqa 544(%rdi), %ymm9 -; AVX512F-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa 608(%rdi), %ymm11 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512F-FAST-NEXT: vmovdqa 512(%rdi), %ymm10 +; AVX512F-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 544(%rdi), %ymm11 ; AVX512F-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %ymm3 -; AVX512F-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 384(%rdi), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm10[1,2],ymm7[3],ymm10[4],ymm7[5],ymm10[6,7],ymm7[8],ymm10[9,10],ymm7[11],ymm10[12],ymm7[13],ymm10[14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm29 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm24 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,0,2,4,6,1,3] -; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 576(%rdi), %ymm7 +; AVX512F-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 608(%rdi), %ymm8 +; AVX512F-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13],ymm1[14],ymm4[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm27 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] +; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5 ; AVX512F-FAST-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0,1],ymm6[2],ymm11[3],ymm6[4],ymm11[5,6],ymm6[7],ymm11[8,9],ymm6[10],ymm11[11],ymm6[12],ymm11[13,14],ymm6[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm30 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm28 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <2,4,7,1,4,6,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm17, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [8,9,3,2,4,5,7,6] -; AVX512F-FAST-NEXT: vpermt2d %ymm2, %ymm19, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,4,7,1,4,6,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm10, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6] +; AVX512F-FAST-NEXT: vpermt2d %ymm2, %ymm17, %ymm7 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,3,1,3,0,3,5,7] -; AVX512F-FAST-NEXT: vmovdqa64 448(%rdi), %ymm27 -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm21, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm25, %zmm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7] +; AVX512F-FAST-NEXT: vmovdqa64 448(%rdi), %ymm29 +; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm18, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa %ymm8, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm26, %zmm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 176(%rdi), %xmm4 ; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm4, %xmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm31 ; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm16 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm15 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm4[1,2],ymm15[3],ymm4[4],ymm15[5],ymm4[6,7],ymm15[8],ymm4[9,10],ymm15[11],ymm4[12],ymm15[13],ymm4[14,15] -; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm6 -; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10,11],ymm5[12],ymm0[13],ymm5[14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa %ymm0, %ymm10 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm4, %xmm15 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] +; AVX512F-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm11 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10,11],ymm14[12],ymm12[13],ymm14[14],ymm12[15] -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm17, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3,4],xmm0[5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d %ymm20, %ymm19, %ymm3 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %ymm9 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1],ymm12[2],ymm9[3],ymm12[4],ymm9[5,6],ymm12[7],ymm9[8,9],ymm12[10],ymm9[11],ymm12[12],ymm9[13,14],ymm12[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3,4],xmm4[5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-FAST-NEXT: vpermt2d %ymm0, %ymm17, %ymm3 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm20 -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm21, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm25, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm18, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm10[1],ymm7[2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7,8],ymm10[9],ymm7[10],ymm10[11],ymm7[12,13],ymm10[14],ymm7[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm31 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm26 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm30 +; AVX512F-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm17 +; AVX512F-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = <2,u,u,u,4,7,1,6> -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10,11],ymm15[12],ymm6[13],ymm15[14],ymm6[15] -; AVX512F-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm22, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <2,u,u,u,4,7,1,6> +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm22 +; AVX512F-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm21, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vporq %ymm3, %ymm0, %ymm19 -; AVX512F-FAST-NEXT: vpsrlq $48, %xmm13, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm17 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,3,12,13,2,3,12,13,2,3,12,13,2,3,12,13] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm11 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm9 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,2,5,7,4,7,u,u> -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10,11],ymm12[12],ymm14[13],ymm12[14],ymm14[15] -; AVX512F-FAST-NEXT: vmovdqa %ymm12, %ymm7 -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm21, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vporq %ymm1, %ymm0, %ymm18 +; AVX512F-FAST-NEXT: vpsrlq $48, %xmm31, %xmm0 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [2,3,12,13,2,3,12,13,2,3,12,13,2,3,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm11 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = <0,2,5,7,4,7,u,u> +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13],ymm13[14],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm13, %ymm19 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm20, %ymm10 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm8[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,4,6,3,1,4,6,3] -; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm12 -; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm10[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3] +; AVX512F-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15] +; AVX512F-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm24, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,3,2,3,1,3,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm16, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm19, %zmm25, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm25, %ymm10 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm18, %zmm26, %zmm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512F-FAST-NEXT: vpblendw $74, (%rsp), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm0 = ymm4[0],mem[1],ymm4[2],mem[3],ymm4[4,5],mem[6],ymm4[7,8],mem[9],ymm4[10],mem[11],ymm4[12,13],mem[14],ymm4[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6],xmm8[7] -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm19 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm23 -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm22, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpor %ymm4, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm6 -; AVX512F-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm4 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5],mem[6],ymm6[7,8],mem[9],ymm6[10,11],mem[12],ymm6[13],mem[14],ymm6[15] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm21, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload -; AVX512F-FAST-NEXT: vpsrlq $48, %xmm29, %xmm4 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-FAST-NEXT: vpblendw $181, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6],mem[7,8],ymm0[9],mem[10],ymm0[11],mem[12,13],ymm0[14],mem[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3],xmm0[4,5,6],xmm10[7] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm13 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5],ymm13[6],ymm7[7,8],ymm13[9],ymm7[10,11],ymm13[12],ymm7[13],ymm13[14],ymm7[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm28 +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm21, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15] +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm20, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload +; AVX512F-FAST-NEXT: vpsrlq $48, %xmm27, %xmm4 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm4 -; AVX512F-FAST-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4],ymm4[5],ymm6[6,7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12],ymm4[13],ymm6[14,15] -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4],ymm6[5],ymm8[6,7],ymm6[8],ymm8[9,10],ymm6[11],ymm8[12],ymm6[13],ymm8[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm24, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm16, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm2 +; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm25, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm21 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm5[1],ymm12[2,3],ymm5[4],ymm12[5],ymm5[6],ymm12[7,8],ymm5[9],ymm12[10,11],ymm5[12],ymm12[13],ymm5[14],ymm12[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,5,14,15,4,5,14,15,4,5,14,15,4,5,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm30 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm17[2],xmm1[3],xmm17[3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm20 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm31[2],xmm0[3],xmm31[3] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = <0,3,5,2,5,7,u,u> -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7,8],ymm14[9],ymm7[10],ymm14[11],ymm7[12,13],ymm14[14],ymm7[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm16 -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm24, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7,8],ymm5[9],ymm12[10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm26 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm24, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = <0,2,u,u,5,7,2,4> +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm25, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,0,1,4,6,0] +; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm5, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm14, %ymm14 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm21, %zmm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10,11],ymm6[12],ymm8[13],ymm6[14],ymm8[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa %ymm15, %ymm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm15[1],ymm6[2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10],ymm15[11],ymm6[12,13],ymm15[14],ymm6[15] +; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm24, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm14 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm27[2],xmm4[3],xmm27[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm15 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm21 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3],ymm8[4],ymm0[5,6],ymm8[7],ymm0[8,9],ymm8[10],ymm0[11],ymm8[12],ymm0[13,14],ymm8[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3,4],xmm0[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <0,2,u,u,5,7,2,4> -; AVX512F-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm0 = ymm15[0],mem[1],ymm15[2,3],mem[4],ymm15[5],mem[6],ymm15[7,8],mem[9],ymm15[10,11],mem[12],ymm15[13],mem[14],ymm15[15] -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm26, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm13[3,4,5,6,7] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [1,4,6,0,1,4,6,0] -; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm13, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm15 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm8, %zmm25, %zmm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3,4],xmm8[5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm6 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] -; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm24, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm10, %xmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm10 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm29[2],xmm2[3],xmm29[3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX512F-FAST-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm15[2],ymm3[3],ymm15[4],ymm3[5,6],ymm15[7],ymm3[8,9],ymm15[10],ymm3[11],ymm15[12],ymm3[13,14],ymm15[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm29 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm28 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm19 -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm26, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm13, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 +; AVX512F-FAST-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm4[2],ymm7[3],ymm4[4],ymm7[5,6],ymm4[7],ymm7[8,9],ymm4[10],ymm7[11],ymm4[12],ymm7[13,14],ymm4[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm30 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm22 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4],xmm2[5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm4[1],ymm13[2,3],ymm4[4],ymm13[5],ymm4[6],ymm13[7,8],ymm4[9],ymm13[10,11],ymm4[12],ymm13[13],ymm4[14],ymm13[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm13, %ymm18 +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm25, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm5, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm21, %zmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0],xmm7[1],xmm13[2,3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [6,7,0,1,10,11,0,0,6,7,0,1,10,11,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <1,3,6,0,5,u,u,u> -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm14[2],ymm9[3],ymm14[4],ymm9[5,6],ymm14[7],ymm9[8,9],ymm14[10],ymm9[11],ymm14[12],ymm9[13,14],ymm14[15] -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm9 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,6,7,0,1,10,11,0,0] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,6,0,5,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7],ymm12[8,9],ymm10[10],ymm12[11],ymm10[12],ymm12[13,14],ymm10[15] +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0] ; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm24 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0],xmm11[1],xmm10[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm16 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm30 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm6[2],ymm8[3],ymm6[4],ymm8[5,6],ymm6[7],ymm8[8,9],ymm6[10],ymm8[11],ymm6[12],ymm8[13,14],ymm6[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm18 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm23 -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm17, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm25 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm7[2],xmm13[3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm9[1,2],ymm14[3],ymm9[4],ymm14[5],ymm9[6,7],ymm14[8],ymm9[9,10],ymm14[11],ymm9[12],ymm14[13],ymm9[14,15] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,4,6,3,6,u,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm25 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0],xmm14[1],xmm15[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm14, %xmm17 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm8[2],ymm6[3],ymm8[4],ymm6[5,6],ymm8[7],ymm6[8,9],ymm8[10],ymm6[11],ymm8[12],ymm6[13,14],ymm8[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm20 +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm9[2],xmm7[3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4],ymm10[5],ymm12[6,7],ymm10[8],ymm12[9,10],ymm10[11],ymm12[12],ymm10[13],ymm12[14,15] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = <1,4,6,3,6,u,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm27, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm26, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm7 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm7[1,2],ymm5[3],ymm7[4],ymm5[5],ymm7[6,7],ymm5[8],ymm7[9,10],ymm5[11],ymm7[12],ymm5[13],ymm7[14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm17 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,3,u,u,5,0,2,7> -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm26, %ymm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm15[1,2],ymm4[3],ymm15[4],ymm4[5],ymm15[6,7],ymm4[8],ymm15[9,10],ymm4[11],ymm15[12],ymm4[13],ymm15[14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,3,u,u,5,0,2,7> +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm10[1],ymm5[2],ymm10[3],ymm5[4,5],ymm10[6],ymm5[7,8],ymm10[9],ymm5[10],ymm10[11],ymm5[12,13],ymm10[14],ymm5[15] -; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3,4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm8, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512F-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm13 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0],xmm13[1,2,3],xmm3[4,5],xmm13[6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] +; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm25 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm3 = ymm14[0],mem[1],ymm14[2,3],mem[4],ymm14[5],mem[6],ymm14[7,8],mem[9],ymm14[10,11],mem[12],ymm14[13],mem[14],ymm14[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm8[1,2,3],xmm3[4,5],xmm8[6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm24, %ymm14 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm24, %zmm20 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm12 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm12[1,2],ymm15[3],ymm12[4],ymm15[5],ymm12[6,7],ymm15[8],ymm12[9,10],ymm15[11],ymm12[12],ymm15[13],ymm12[14,15] -; AVX512F-FAST-NEXT: vmovdqa %ymm15, %ymm13 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3] -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm3[1],ymm15[2],ymm3[3],ymm15[4,5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10],ymm3[11],ymm15[12,13],ymm3[14],ymm15[15] -; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm7, %zmm8, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3] -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm25, %ymm9 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm25, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm11 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm11[1,2],ymm9[3],ymm11[4],ymm9[5],ymm11[6,7],ymm9[8],ymm11[9,10],ymm9[11],ymm11[12],ymm9[13],ymm11[14,15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm13 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm26, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2],ymm8[3],ymm13[4,5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10],ymm8[11],ymm13[12,13],ymm8[14],ymm13[15] +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm27, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm26, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10,11],ymm11[12],ymm9[13],ymm11[14],ymm9[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4,5],xmm2[6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm25, %ymm2 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm25, %zmm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm10[2],ymm5[3],ymm10[4],ymm5[5,6],ymm10[7],ymm5[8,9],ymm10[10],ymm5[11],ymm10[12],ymm5[13,14],ymm10[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,3,u,u,6,0,3,5> -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm7 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3,4],xmm7[5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm24, %ymm3 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm24, %zmm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm12[2],ymm10[3],ymm12[4],ymm10[5,6],ymm12[7],ymm10[8,9],ymm12[10],ymm10[11],ymm12[12],ymm10[13,14],ymm12[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,3,u,u,6,0,3,5> +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2,3],ymm6[4],ymm15[5],ymm6[6],ymm15[7,8],ymm6[9],ymm15[10,11],ymm6[12],ymm15[13],ymm6[14],ymm15[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] ; AVX512F-FAST-NEXT: movb $7, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm6, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm5 -; AVX512F-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm5 = mem[0],ymm5[1],mem[2],ymm5[3],mem[4,5],ymm5[6],mem[7,8],ymm5[9],mem[10],ymm5[11],mem[12,13],ymm5[14],mem[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3],xmm5[4,5,6],xmm8[7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm6 = mem[0],ymm14[1],mem[2],ymm14[3],mem[4,5],ymm14[6],mem[7,8],ymm14[9],mem[10],ymm14[11],mem[12,13],ymm14[14],mem[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6],xmm8[7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm5[1,2,3,4,5,6,7],ymm2[8],ymm5[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10,11],ymm13[12],ymm12[13],ymm13[14],ymm12[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0,1],ymm3[2],ymm15[3],ymm3[4],ymm15[5,6],ymm3[7],ymm15[8,9],ymm3[10],ymm15[11],ymm3[12],ymm15[13,14],ymm3[15] -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm14, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1,2,3,4,5,6,7],ymm3[8],ymm6[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1],ymm6[2],ymm13[3],ymm6[4],ymm13[5,6],ymm6[7],ymm13[8,9],ymm6[10],ymm13[11],ymm6[12],ymm13[13,14],ymm6[15] +; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm9[1],ymm11[2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10],ymm9[11],ymm11[12,13],ymm9[14],ymm11[15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] ; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 @@ -7637,11 +7641,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rcx) ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, (%r8) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512F-FAST-NEXT: addq $520, %rsp # imm = 0x208 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%r9) +; AVX512F-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index 993029374b700..57fab33b13fa3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -277,19 +277,19 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: psrld $16, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,3,2,3] ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero @@ -304,8 +304,8 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm2, %xmm9 ; SSE-NEXT: por %xmm8, %xmm9 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] @@ -322,8 +322,8 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movq %xmm3, (%rsi) -; SSE-NEXT: movq %xmm4, (%rdx) +; SSE-NEXT: movq %xmm4, (%rsi) +; SSE-NEXT: movq %xmm3, (%rdx) ; SSE-NEXT: movq %xmm9, (%rcx) ; SSE-NEXT: movq %xmm2, (%r8) ; SSE-NEXT: movq %xmm6, (%r9) @@ -570,118 +570,118 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i16_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa 80(%rdi), %xmm7 -; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa 80(%rdi), %xmm8 +; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa 16(%rdi), %xmm5 ; SSE-NEXT: movdqa 32(%rdi), %xmm6 ; SSE-NEXT: movdqa 48(%rdi), %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[2,3] -; SSE-NEXT: pslld $16, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm1[3,0] +; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,3] +; SSE-NEXT: pslld $16, %xmm8 ; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm7[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm8[0,1,0,2,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm13[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,0] ; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm7[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,1,0,3] ; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm6, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm13 -; SSE-NEXT: por %xmm10, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm12[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm14 -; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm8[0] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm5[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm6[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm14 +; SSE-NEXT: por %xmm8, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm12[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm8[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm12, %xmm8 +; SSE-NEXT: por %xmm14, %xmm8 ; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: psrlq $48, %xmm12 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm12[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: pand %xmm10, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm4[1] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm7[0],xmm6[1,2,3] -; SSE-NEXT: andps %xmm10, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm7, %xmm11 -; SSE-NEXT: por %xmm6, %xmm11 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm12[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: pandn %xmm9, %xmm11 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: pand %xmm6, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,1,0,2] +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pandn %xmm10, %xmm9 +; SSE-NEXT: por %xmm11, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm4[1] +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm11[0],xmm10[1,2,3] +; SSE-NEXT: andps %xmm6, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: por %xmm10, %xmm12 ; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; SSE-NEXT: psrld $16, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,5,5,7] ; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm4[1] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm2[0],xmm5[1,2,3] -; SSE-NEXT: andps %xmm10, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: por %xmm5, %xmm10 -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movdqa %xmm14, (%rcx) -; SSE-NEXT: movdqa %xmm8, (%r8) -; SSE-NEXT: movdqa %xmm11, (%r9) -; SSE-NEXT: movdqa %xmm10, (%rax) +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] +; SSE-NEXT: andps %xmm6, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movdqa %xmm8, (%rcx) +; SSE-NEXT: movdqa %xmm9, (%r8) +; SSE-NEXT: movdqa %xmm12, (%r9) +; SSE-NEXT: movdqa %xmm6, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride6_vf8: @@ -1108,289 +1108,295 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i16_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $104, %rsp -; SSE-NEXT: movdqa 112(%rdi), %xmm8 -; SSE-NEXT: movdqa 128(%rdi), %xmm12 +; SSE-NEXT: subq $136, %rsp +; SSE-NEXT: movdqa 112(%rdi), %xmm9 +; SSE-NEXT: movdqa 128(%rdi), %xmm7 ; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa 80(%rdi), %xmm10 +; SSE-NEXT: movdqa 80(%rdi), %xmm11 ; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm6 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm11 +; SSE-NEXT: movdqa 48(%rdi), %xmm8 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm11[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm10 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 +; SSE-NEXT: pslld $16, %xmm11 +; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa 160(%rdi), %xmm14 +; SSE-NEXT: movdqa 176(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm3[2,3] +; SSE-NEXT: pslld $16, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,4,6,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; SSE-NEXT: pand %xmm13, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa 160(%rdi), %xmm4 -; SSE-NEXT: movdqa 176(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,3] -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] -; SSE-NEXT: movdqa 144(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm9[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: psrld $16, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm9[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm13[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm13 +; SSE-NEXT: psrld $16, %xmm13 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: pand %xmm13, %xmm9 -; SSE-NEXT: por %xmm15, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm15[2,0] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: psrld $16, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: por %xmm6, %xmm13 +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pandn %xmm15, %xmm13 +; SSE-NEXT: pand %xmm10, %xmm9 +; SSE-NEXT: por %xmm13, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm12[2,0] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: psrld $16, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: movdqa %xmm15, (%rsp) # 16-byte Spill +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm3[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0,1],mem[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[3,1,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm15[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm15[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[0,1],mem[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm15 = xmm15[1],xmm13[1] -; SSE-NEXT: movss {{.*#+}} xmm15 = xmm1[0],xmm15[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: andps %xmm12, %xmm15 -; SSE-NEXT: por %xmm15, %xmm2 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; SSE-NEXT: pshufd $196, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: andps %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm15[0],xmm0[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE-NEXT: psrld $16, %xmm13 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm13[1] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: pandn %xmm6, %xmm15 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: andps %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm6[0],xmm2[1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: andps %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: psrlq $48, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: psrld $16, %xmm8 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm4, %xmm12 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movaps %xmm10, 16(%rsi) +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7] +; SSE-NEXT: pandn %xmm3, %xmm12 +; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps %xmm9, (%rdx) -; SSE-NEXT: movdqa %xmm11, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movdqa %xmm3, 16(%r8) -; SSE-NEXT: movdqa %xmm7, (%r8) -; SSE-NEXT: movdqa %xmm1, 16(%r9) -; SSE-NEXT: movdqa %xmm2, (%r9) +; SSE-NEXT: movdqa %xmm7, 16(%r8) +; SSE-NEXT: movdqa %xmm11, (%r8) +; SSE-NEXT: movdqa %xmm4, 16(%r9) +; SSE-NEXT: movdqa %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm12, 16(%rax) -; SSE-NEXT: movdqa %xmm15, (%rax) -; SSE-NEXT: addq $104, %rsp +; SSE-NEXT: movdqa %xmm10, (%rax) +; SSE-NEXT: addq $136, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride6_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $104, %rsp +; AVX1-ONLY-NEXT: subq $88, %rsp ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm2 @@ -1398,21 +1404,22 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm7, %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2],ymm5[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vpslld $16, %xmm10, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm11 @@ -1427,28 +1434,28 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm9 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] @@ -1461,17 +1468,17 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm6[0,1],mem[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm5[0,1],mem[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm5[2,3],xmm8[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1,2,3],xmm13[4,5],xmm4[6,7] @@ -1483,22 +1490,22 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm7[0],xmm1[0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm10[4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm9 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm8[0],xmm1[0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm10[4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm9 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3,4],xmm9[5,6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm9[0],xmm1[0] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] @@ -1510,46 +1517,47 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm9[0],xmm2[0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,4,5,4,6] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vandps %ymm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm3 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm5[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2,3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 @@ -1558,14 +1566,14 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm9, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm9, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm10[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6,7] @@ -1575,44 +1583,44 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm14, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) -; AVX1-ONLY-NEXT: addq $104, %rsp +; AVX1-ONLY-NEXT: addq $88, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride6_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[2,2,2,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,2,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] @@ -1620,11 +1628,11 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3],xmm9[4,5],xmm10[6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4] @@ -1634,7 +1642,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm12 @@ -1661,73 +1669,73 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,1,0,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rsi) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%rdx) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride6_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm7 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[2,1,0,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] @@ -1735,11 +1743,11 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4] @@ -1748,7 +1756,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 @@ -1772,70 +1780,70 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rsi) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rdx) ; AVX2-FAST-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm4, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride6_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] @@ -1843,11 +1851,11 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4] @@ -1856,7 +1864,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm12 @@ -1880,40 +1888,40 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2209,305 +2217,274 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i16_stride6_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $488, %rsp # imm = 0x1E8 -; SSE-NEXT: movdqa 304(%rdi), %xmm5 -; SSE-NEXT: movdqa 320(%rdi), %xmm7 -; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa 80(%rdi), %xmm10 -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: subq $456, %rsp # imm = 0x1C8 +; SSE-NEXT: movdqa 304(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm3 +; SSE-NEXT: movdqa 80(%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm10[2,3] +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm10 -; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,3,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 288(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 352(%rdi), %xmm4 -; SSE-NEXT: movdqa 368(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa 336(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa 352(%rdi), %xmm2 +; SSE-NEXT: movdqa 368(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa 336(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa 208(%rdi), %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa 208(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 256(%rdi), %xmm4 -; SSE-NEXT: movdqa 272(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa 240(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa 256(%rdi), %xmm5 +; SSE-NEXT: movdqa 272(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm7[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm7[2,3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm7 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movdqa 240(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa 112(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: movdqa 112(%rdi), %xmm11 ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa 160(%rdi), %xmm1 -; SSE-NEXT: movdqa 176(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm11 -; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE-NEXT: movdqa 144(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm4[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm11[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,0] +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa 160(%rdi), %xmm5 +; SSE-NEXT: movdqa 176(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[3,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm9[2,3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm9 +; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE-NEXT: movdqa 144(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm9[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm13[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; SSE-NEXT: movdqa %xmm10, %xmm9 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: psrld $16, %xmm10 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm8[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: psrld $16, %xmm8 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: por %xmm9, %xmm11 -; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm9[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm13[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pandn %xmm12, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pandn %xmm11, %xmm8 +; SSE-NEXT: por %xmm3, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm5[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps $132, (%rsp), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm5[0] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm15, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm7[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm5 ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] @@ -2515,313 +2492,341 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm12 -; SSE-NEXT: por %xmm6, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $132, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm6[0] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm11, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[0,1],mem[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por %xmm12, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm6 ; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm6[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] ; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm7, %xmm6 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm7 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm2, %xmm13 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm12[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: andps %xmm0, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1],mem[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: pandn %xmm3, %xmm11 +; SSE-NEXT: andps %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm12[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: andps %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: andps %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm14[1] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm4[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm14, %xmm4 +; SSE-NEXT: andps %xmm15, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1],mem[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm7[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: andps %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm9[1] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: andps %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: psrlq $48, %xmm10 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: psrlq $48, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrld $16, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm12[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: andps %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm11 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm15[0],xmm2[1,2,3] +; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm7 +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm8, %xmm15 -; SSE-NEXT: andps %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm11 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm8[0],xmm2[1,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm15, %xmm0 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm8, %xmm12 -; SSE-NEXT: andps %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm11 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm8[0],xmm2[1,2,3] -; SSE-NEXT: andps %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movaps %xmm14, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rcx) -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm7, 16(%r8) -; SSE-NEXT: movdqa %xmm13, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%r8) +; SSE-NEXT: pandn %xmm8, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movdqa %xmm3, 16(%r9) ; SSE-NEXT: movdqa %xmm4, 32(%r9) ; SSE-NEXT: movdqa %xmm5, 48(%r9) -; SSE-NEXT: movdqa %xmm6, (%r9) +; SSE-NEXT: movdqa %xmm11, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm12, 32(%rax) -; SSE-NEXT: movdqa %xmm15, 48(%rax) -; SSE-NEXT: movdqa %xmm1, (%rax) -; SSE-NEXT: addq $488, %rsp # imm = 0x1E8 +; SSE-NEXT: movdqa %xmm15, 16(%rax) +; SSE-NEXT: movdqa %xmm9, 32(%rax) +; SSE-NEXT: movdqa %xmm1, 48(%rax) +; SSE-NEXT: movdqa %xmm2, (%rax) +; SSE-NEXT: addq $456, %rsp # imm = 0x1C8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $536, %rsp # imm = 0x218 +; AVX1-ONLY-NEXT: subq $552, %rsp # imm = 0x228 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 @@ -2833,6 +2838,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm10 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 @@ -2842,9 +2848,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,3] @@ -2855,47 +2861,48 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpslld $16, %xmm9, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm10 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm0 @@ -2904,200 +2911,200 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm9 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm8 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3,4,5],xmm7[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[2,2,3,3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX1-ONLY-NEXT: vpunpcklwd (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm10[0,1],mem[2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm11[0,1],mem[2,3],xmm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,2,3],xmm12[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm1, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1,2,3],xmm8[4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2,3],xmm8[4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm8 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm8, %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm9[0,1],mem[2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm13[0,1],mem[2,3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm8 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,2,3],xmm8[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,2,3],xmm6[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm9, %ymm11 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm1, %ymm11 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm15 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm15 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm12[4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm3[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm15[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm11, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm2 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm11[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3,4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm2[3,4],xmm9[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm9[0],xmm4[0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -3106,125 +3113,129 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm15[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm14[1] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $238, (%rsp), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm10[1] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm7[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm15, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm15, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm9, %ymm10, %ymm11 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,5,4,6] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,1,0,3] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm2, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm9, %ymm11, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm12 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm13[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm13 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm14 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm14 +; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm14[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm5 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -3241,13 +3252,13 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX1-ONLY-NEXT: addq $536, %rsp # imm = 0x218 +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) +; AVX1-ONLY-NEXT: addq $552, %rsp # imm = 0x228 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3260,43 +3271,44 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm11 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm3[2,3],ymm2[2,3] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3] ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm10 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm0 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm11[1],xmm0[2,3],xmm11[4],xmm0[5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm6, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm10 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2,3],xmm13[4],xmm10[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm9 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0],ymm14[1],ymm8[2,3,4,5],ymm14[6],ymm8[7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm13, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm8, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] @@ -3304,235 +3316,237 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm13, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3],xmm11[4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm10[1],ymm4[2,3],ymm10[4],ymm4[5,6],ymm10[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3],xmm6[4],xmm3[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,2,0,3] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,2,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0],ymm8[1],ymm4[2,3,4,5],ymm8[6],ymm4[7] -; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3],xmm12[4,5],xmm4[6],xmm12[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm13[1],ymm14[2,3,4,5],ymm13[6],ymm14[7] +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm7, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,1,2,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm8 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm11[3],xmm8[4,5],xmm11[6],xmm8[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, (%rsp), %ymm2, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm15 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm14[0],mem[1],ymm14[2,3,4,5],mem[6],ymm14[7] -; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm15, %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3,4,5,6,7],ymm12[8,9,10],ymm8[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3],xmm7[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3],xmm11[4,5],xmm9[6],xmm11[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2],ymm9[3,4,5,6,7],ymm8[8,9,10],ymm9[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm15 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0,1,2],xmm10[3],xmm15[4,5],xmm10[6],xmm15[7] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm13[2],xmm6[3],xmm13[4,5],xmm6[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[0],ymm8[1],mem[2,3,4,5],ymm8[6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm6[0,1,2],ymm10[3,4,5,6,7],ymm6[8,9,10],ymm10[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3],xmm5[4,5],xmm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3],xmm4[4,5],xmm0[6],xmm4[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm15, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,0,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5,6],xmm5[7] -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm10[2],mem[3,4],ymm10[5],mem[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,2,3] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm5[1,2],xmm12[3],xmm5[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2],ymm1[3,4,5,6,7],ymm13[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm13[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm14 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,0,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm13[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm8[4],xmm5[5,6],xmm8[7] -; AVX2-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0],xmm3[1,2],xmm12[3],xmm3[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1,2],ymm5[3,4,5,6,7],ymm12[8,9,10],ymm5[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm12[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2],xmm4[3],xmm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1,2],xmm12[3],xmm15[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm11[0,1,2],ymm13[3,4,5,6,7],ymm11[8,9,10],ymm13[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4],xmm11[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5,6],xmm7[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm8[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6],ymm9[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4],xmm4[5],xmm7[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5,6],mem[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4],xmm8[5],xmm4[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4],xmm9[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-SLOW-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX2-SLOW-NEXT: vzeroupper @@ -3542,227 +3556,231 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm3[2,3],ymm2[2,3] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3] ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm11[1],xmm0[2,3],xmm11[4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm10[1],xmm0[2,3],xmm10[4],xmm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm6, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2,3],xmm13[4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm10 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0],ymm14[1],ymm8[2,3,4,5],ymm14[6],ymm8[7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm7 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm1 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm11, %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,1,0,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm8[1],ymm5[2,3,4,5],ymm8[6],ymm5[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3],xmm12[4,5],xmm6[6],xmm12[7] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0],ymm13[1],ymm4[2,3,4,5],ymm13[6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5],xmm9[6],xmm8[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, (%rsp), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm15[2],xmm3[3],xmm15[4,5],xmm3[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm14[0],mem[1],ymm14[2,3,4,5],mem[6],ymm14[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3,4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3],xmm10[4,5],xmm3[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3],xmm9[4,5],xmm4[6],xmm9[7] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm15 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3,4,5,6,7],ymm12[8,9,10],ymm4[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm11[2],xmm5[3],xmm11[4,5],xmm5[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3],xmm10[4,5],xmm1[6],xmm10[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm1 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm13, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,0,3] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm4 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm9[4],xmm4[5,6],xmm9[7] +; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm13 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3,4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm11[2],ymm6[3,4],ymm11[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,1] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5,6],xmm8[7] -; AVX2-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,2,3] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm7 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm7[1,2],xmm13[3],xmm7[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7],ymm14[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,5,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm14[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm15[4],xmm0[5,6],xmm15[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm12 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm9 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2],xmm15[3],xmm12[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1,2],ymm7[3,4,5,6,7],ymm15[8,9,10],ymm7[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm15[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6],xmm2[7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5,6],xmm4[7] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2],xmm9[3],xmm10[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm10 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1,2],xmm3[3],xmm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3,4],xmm3[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm3 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm12, %xmm3 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2],xmm0[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6],ymm6[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7] @@ -3770,30 +3788,30 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5,6],mem[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6],ymm8[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm6 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4],xmm6[5],xmm3[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -3804,13 +3822,13 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps %ymm6, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 32(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX2-FAST-NEXT: vzeroupper @@ -3820,227 +3838,231 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm3[2,3],ymm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm11[1],xmm0[2,3],xmm11[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm10[1],xmm0[2,3],xmm10[4],xmm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm6, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2,3],xmm13[4],xmm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0],ymm14[1],ymm8[2,3,4,5],ymm14[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm13, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm14 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm12, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm7 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm15 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm10, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm8[1],ymm5[2,3,4,5],ymm8[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm12, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm5, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3],xmm12[4,5],xmm6[6],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0],ymm13[1],ymm4[2,3,4,5],ymm13[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5],xmm9[6],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $146, (%rsp), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm12, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm15[2],xmm3[3],xmm15[4,5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm14[0],mem[1],ymm14[2,3,4,5],mem[6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm4 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3],xmm10[4,5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3],xmm9[4,5],xmm4[6],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm9, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm13, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3,4,5,6,7],ymm12[8,9,10],ymm4[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm11[2],xmm5[3],xmm11[4,5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3],xmm10[4,5],xmm1[6],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm9[4],xmm4[5,6],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm10, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3,4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm11[2],ymm6[3,4],ymm11[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5,6],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm11, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm7[1,2],xmm13[3],xmm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm6, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7],ymm14[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm15[4],xmm0[5,6],xmm15[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm4, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2],xmm15[3],xmm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1,2],ymm7[3,4,5,6,7],ymm15[8,9,10],ymm7[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5,6],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2],xmm9[3],xmm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1,2],xmm3[3],xmm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3,4],xmm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2],xmm0[3],xmm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7] @@ -4048,30 +4070,30 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5,6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm8, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4],xmm6[5],xmm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -4082,13 +4104,13 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -4096,102 +4118,101 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride6_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: pushq %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,2,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: subq $136, %rsp +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm5[2],ymm14[3,4],ymm5[5],ymm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3],xmm5[4,5],xmm9[6],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3],xmm7[4,5],xmm9[6],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm7, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm6[2,3],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,2,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2],xmm0[3],xmm9[4,5],xmm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm12[1],ymm9[2,3,4,5],ymm12[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3],xmm9[4,5],xmm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm13[1],ymm9[2,3,4,5],ymm13[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm10[3],xmm8[4,5],xmm10[6],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5],xmm9[6],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm8[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm8, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3],xmm7[4,5],xmm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3],xmm6[4,5],xmm2[6],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm2, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm8, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm11[1],ymm13[2,3,4,5],ymm11[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm3, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm31 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,1] @@ -4200,8 +4221,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2],ymm1[3,4],ymm14[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7] @@ -4211,26 +4232,26 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm21 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm15[2],ymm12[3],ymm15[4],ymm12[5,6],ymm15[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] @@ -4239,9 +4260,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] @@ -4261,12 +4282,12 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] @@ -4282,358 +4303,356 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm17, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm12[1],ymm0[2,3,4,5],ymm12[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm11, %ymm10, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm11, %ymm10, %ymm8 ; AVX512F-ONLY-SLOW-NEXT: movw $31, %ax ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5,6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6],ymm15[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,0,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4,5],ymm15[6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4],ymm10[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm14[4],xmm10[5],xmm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3,4,5],ymm10[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm11, %ymm4, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4],xmm3[5],xmm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm11, %ymm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm30, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm21, %zmm2, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm19, %zmm13 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm22, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm18, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, (%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: popq %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $136, %rsp ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i16_stride6_vf32: ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: subq $136, %rsp -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm4[2],ymm13[3,4],ymm4[5],ymm13[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm7 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm11[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm11, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm2[1],ymm11[2,3,4,5],ymm2[6],ymm11[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm6[2],xmm8[3],xmm6[4,5],xmm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm9, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0],ymm0[1],ymm14[2,3,4,5],ymm0[6],ymm14[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm2[2],ymm11[3,4],ymm2[5],ymm11[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm2[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,6] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm9[2],xmm4[3],xmm9[4,5],xmm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3],xmm7[4,5],xmm3[6],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm8[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm8, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm0[1],ymm12[2,3,4,5],ymm0[6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm11[2],xmm3[3],xmm11[4,5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm30 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm31 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm17 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm13[2],ymm14[3],ymm13[4],ymm14[5,6],ymm13[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm12[2],ymm14[3],ymm12[4],ymm14[5,6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm1, %xmm16 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm16[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm12[4],xmm1[5,6],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm16[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2],xmm2[3],xmm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm11, %zmm17, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5,6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm7, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5,6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm3[4],xmm9[5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3],xmm12[4],xmm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm11[1],xmm4[2,3],xmm11[4],xmm4[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm23, %ymm12, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm23, %ymm13, %ymm4 ; AVX512F-ONLY-FAST-NEXT: movw $31, %ax ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5,6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2,3],xmm7[4],xmm10[5],xmm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm10[1],ymm12[2,3,4,5],ymm10[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm9, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm8[4],xmm1[5],xmm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm15[4],xmm9[5],xmm15[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2,3,4,5],ymm9[6],ymm11[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm7, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] @@ -4641,18 +4660,18 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm28, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm19, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm22, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm28, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%r9) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-ONLY-FAST-NEXT: addq $136, %rsp ; AVX512F-ONLY-FAST-NEXT: vzeroupper @@ -4661,103 +4680,103 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-LABEL: load_i16_stride6_vf32: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: pushq %rax -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,2,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1],ymm2[2],ymm6[3,4],ymm2[5],ymm6[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[3],xmm2[4,5],xmm8[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm16 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm24 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2],xmm1[3],xmm9[4,5],xmm1[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm12[1],ymm8[2,3,4,5],ymm12[6],ymm8[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm29 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm16 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm3[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm10[1],ymm12[2,3,4,5],ymm10[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm29 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2],ymm4[3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm14[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3],xmm6[4,5],xmm11[6],xmm6[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm6, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3],xmm7[4,5],xmm11[6],xmm7[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm7, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm26 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm9 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm10 ; AVX512DQ-SLOW-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm8[2],xmm4[3],xmm8[4,5],xmm4[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm15[3],xmm7[4,5],xmm15[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm7, %zmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3],xmm10[4,5],xmm5[6],xmm10[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm17, %zmm3 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm20 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm30 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -4766,9 +4785,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,0,0,0,4,5,6,7] @@ -4778,44 +4797,44 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm12[2],ymm11[3],ymm12[4],ymm11[5,6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,5,6,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5,6],xmm9[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,4,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4],xmm8[5,6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm17, %zmm18 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm9, %zmm17, %zmm18 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[3,1,2,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[0,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] @@ -4827,93 +4846,93 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2],xmm4[3],xmm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[1,1,1,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[1,1,1,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5,6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm3 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm7 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm13 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm9[1],xmm5[2,3],xmm9[4],xmm5[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm10, %ymm9, %ymm7 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm10, %ymm9, %ymm5 ; AVX512DQ-SLOW-NEXT: movw $31, %ax ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm7, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm5, %zmm0, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm5 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm8 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm7[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm14[0,1,2,3],xmm8[4],xmm14[5],xmm8[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6],ymm9[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm14[4],xmm9[5],xmm14[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm11 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4,5],ymm11[6],ymm14[7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm11[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4],ymm9[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> ; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3],xmm5[4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm10, %ymm1, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3],xmm6[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm10, %ymm4, %ymm2 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7] @@ -4926,11 +4945,11 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, (%rdx) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm8 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm9 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%r9) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-SLOW-NEXT: popq %rax ; AVX512DQ-SLOW-NEXT: vzeroupper @@ -4938,272 +4957,270 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-FAST-LABEL: load_i16_stride6_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: pushq %rax ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %ymm12 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm1 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm4 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5,6],ymm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm22 +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm4 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5],xmm4[6],xmm5[7] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm16 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm19 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm9, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm27 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm9[1],ymm2[2,3],ymm9[4],ymm2[5,6],ymm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2],ymm3[3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm2 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3],xmm8[4,5],xmm2[6],xmm8[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm11, %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3],xmm8[4,5],xmm0[6],xmm8[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm0[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm10[1],ymm0[2,3,4,5],ymm10[6],ymm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm26 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm9 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm11 ; AVX512DQ-FAST-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm7 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm14[2],xmm5[3],xmm14[4,5],xmm5[6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3],xmm14[4,5],xmm7[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm15[3],xmm10[4,5],xmm15[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm10, %zmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm15 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3],xmm15[4,5],xmm6[6],xmm15[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm17, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm17, %zmm13 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm28 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm30 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm30 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,0,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1,2],xmm0[3],xmm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm9[2],ymm12[3],ymm9[4],ymm12[5,6],ymm9[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm14[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm17 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm17[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm6[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3],xmm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm10, %zmm17, %zmm18 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm9, %zmm17, %zmm18 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1,2],xmm10[3],xmm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6],xmm2[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3],xmm6[4],xmm10[5,6],xmm6[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm6, %zmm6 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm14[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6],xmm5[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm15 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm12[1],ymm9[2,3,4,5],ymm12[6],ymm9[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm4 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm11[1],xmm4[2,3],xmm11[4],xmm4[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4],xmm7[5],xmm4[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2,3],xmm12[4],xmm6[5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm19, %ymm13, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm19, %ymm12, %ymm6 ; AVX512DQ-FAST-NEXT: movw $31, %ax ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm4, %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6],ymm11[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm14 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm14[4],xmm7[5],xmm14[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm6, %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6],ymm12[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5],xmm8[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm9 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4],ymm7[5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm19, %ymm8, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3,4,5],ymm11[6],ymm9[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm19, %ymm7, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5],xmm5[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm4, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%rsi) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm7 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm8 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-FAST-NEXT: popq %rax ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -5313,30 +5330,30 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i16_stride6_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1160, %rsp # imm = 0x488 +; SSE-NEXT: subq $1176, %rsp # imm = 0x498 ; SSE-NEXT: movdqa 496(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm8 +; SSE-NEXT: movdqa 512(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 144(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm3 ; SSE-NEXT: movdqa 176(%rdi), %xmm0 -; SSE-NEXT: movdqa 112(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm1 @@ -5353,14 +5370,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,3,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 480(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5368,22 +5385,22 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 544(%rdi), %xmm4 +; SSE-NEXT: movdqa 544(%rdi), %xmm3 ; SSE-NEXT: movdqa 560(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -5398,28 +5415,30 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa 64(%rdi), %xmm3 ; SSE-NEXT: movdqa 80(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5437,7 +5456,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 416(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 400(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5447,21 +5466,21 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 448(%rdi), %xmm4 +; SSE-NEXT: movdqa 448(%rdi), %xmm3 ; SSE-NEXT: movdqa 464(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -5476,7 +5495,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 304(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5486,21 +5505,21 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 352(%rdi), %xmm4 +; SSE-NEXT: movdqa 352(%rdi), %xmm3 ; SSE-NEXT: movdqa 368(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -5515,7 +5534,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 704(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 688(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5525,36 +5544,36 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 736(%rdi), %xmm4 -; SSE-NEXT: movdqa 752(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 720(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movdqa 736(%rdi), %xmm3 +; SSE-NEXT: movdqa 752(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa 720(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 224(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5564,80 +5583,79 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 256(%rdi), %xmm3 +; SSE-NEXT: movdqa 256(%rdi), %xmm4 ; SSE-NEXT: movdqa 272(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[3,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 608(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa 592(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa 592(%rdi), %xmm13 ; SSE-NEXT: movdqa 576(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 640(%rdi), %xmm7 -; SSE-NEXT: movdqa 656(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[3,0] +; SSE-NEXT: movdqa 640(%rdi), %xmm5 +; SSE-NEXT: movdqa 656(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[2,3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[2,3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa 624(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm9[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[1,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 624(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] @@ -5650,54 +5668,56 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] @@ -5709,137 +5729,135 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,1,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm13 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm10[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movdqa %xmm4, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm12[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] ; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7] +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm15[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm15[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] ; SSE-NEXT: pand %xmm4, %xmm2 @@ -5851,13 +5869,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[1,1,1,1] @@ -5880,15 +5897,15 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] ; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 @@ -5908,154 +5925,156 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm12[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: pandn %xmm6, %xmm12 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm12, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: pandn %xmm12, %xmm15 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm6, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm9 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm9[0] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm12, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pshufhw $231, (%rsp), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: pandn %xmm6, %xmm12 +; SSE-NEXT: pand %xmm14, %xmm8 +; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm6[0] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: psrlq $48, %xmm3 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm3 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm10 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm10[0] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] @@ -6066,16 +6085,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 @@ -6087,19 +6106,17 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm4, %xmm2 @@ -6107,50 +6124,51 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,2,3,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufhw $231, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1],mem[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -6159,57 +6177,57 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm8[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm9[1] ; SSE-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: andps %xmm14, %xmm4 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm7[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm15[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: andps %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1],mem[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm12[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: andps %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -6219,35 +6237,35 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm3 = mem[0,1,0,3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1],mem[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm8[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: movdqa %xmm14, %xmm12 ; SSE-NEXT: pandn %xmm4, %xmm12 -; SSE-NEXT: andps %xmm15, %xmm3 +; SSE-NEXT: andps %xmm14, %xmm3 ; SSE-NEXT: por %xmm3, %xmm12 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,0,3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm9[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm6[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: andps %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm15 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -6257,56 +6275,55 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm3 = mem[0,1,0,3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[1],mem[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: andps %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm8 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,1,0,3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm10[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm7[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: psrlq $48, %xmm13 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: andps %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm5[1] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: andps %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: andps %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm2 @@ -6317,55 +6334,54 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm2 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm3 @@ -6377,15 +6393,15 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm13 -; SSE-NEXT: pandn %xmm2, %xmm13 -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm3 @@ -6396,134 +6412,135 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: andps %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm4, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%r8) +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%r8) +; SSE-NEXT: movaps %xmm1, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%r8) +; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%r8) +; SSE-NEXT: movaps %xmm1, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%r8) +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%r8) +; SSE-NEXT: movaps %xmm1, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movdqa %xmm8, 112(%r9) -; SSE-NEXT: movdqa %xmm9, 96(%r9) -; SSE-NEXT: movdqa %xmm11, 80(%r9) +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movdqa %xmm7, 112(%r9) +; SSE-NEXT: movdqa %xmm8, 96(%r9) +; SSE-NEXT: movdqa %xmm15, 80(%r9) ; SSE-NEXT: movdqa %xmm12, 64(%r9) -; SSE-NEXT: movdqa %xmm0, 48(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm15, 112(%rax) -; SSE-NEXT: movdqa %xmm2, 96(%rax) -; SSE-NEXT: movdqa %xmm5, 80(%rax) -; SSE-NEXT: movdqa %xmm13, 64(%rax) -; SSE-NEXT: movdqa %xmm14, 48(%rax) -; SSE-NEXT: movdqa %xmm10, 32(%rax) -; SSE-NEXT: movdqa %xmm6, 16(%rax) -; SSE-NEXT: movdqa %xmm7, (%rax) -; SSE-NEXT: addq $1160, %rsp # imm = 0x488 +; SSE-NEXT: movdqa %xmm14, 112(%rax) +; SSE-NEXT: movdqa %xmm3, 96(%rax) +; SSE-NEXT: movdqa %xmm4, 80(%rax) +; SSE-NEXT: movdqa %xmm10, 64(%rax) +; SSE-NEXT: movdqa %xmm11, 48(%rax) +; SSE-NEXT: movdqa %xmm9, 32(%rax) +; SSE-NEXT: movdqa %xmm5, 16(%rax) +; SSE-NEXT: movdqa %xmm6, (%rax) +; SSE-NEXT: addq $1176, %rsp # imm = 0x498 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride6_vf64: @@ -6549,27 +6566,28 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2 @@ -6579,315 +6597,317 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa 672(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 672(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm3[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm15[3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 752(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm14 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm15 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 720(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 720(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] ; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm15 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm13 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm13 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd $250, (%rsp), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm9 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3,4,5],xmm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3,4,5],xmm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6895,115 +6915,115 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, (%rsp), %xmm3, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm3[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm6[0,1],mem[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm11[4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm5[0,1],mem[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2,3],xmm10[4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm4[4,5],xmm7[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm13, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] @@ -7014,129 +7034,132 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, (%rsp), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm1[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,3],xmm6[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3,4],xmm14[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3,4],xmm14[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3,4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm8[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm8[0],xmm7[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3,4],xmm7[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm8[0],xmm7[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm7[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3,4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm9, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 @@ -7146,9 +7169,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -7157,9 +7180,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpblendw $243, (%rsp), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1],xmm1[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 @@ -7172,21 +7195,20 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm8[1] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm13[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm10[1] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -7217,15 +7239,15 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1],xmm2[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,1],xmm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,4,6] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm10 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] @@ -7236,9 +7258,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload @@ -7246,18 +7268,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm8[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1],xmm7[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm15[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7271,13 +7293,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] @@ -7337,7 +7360,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -7369,7 +7392,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -7398,10 +7421,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm8 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm8 @@ -7474,7 +7497,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-SLOW-LABEL: load_i16_stride6_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1256, %rsp # imm = 0x4E8 +; AVX2-SLOW-NEXT: subq $1272, %rsp # imm = 0x4F8 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm2 @@ -7483,123 +7506,118 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm7[2,3],ymm6[2,3] -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[0,1],ymm6[0,1] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm5[2,3],ymm4[2,3] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm5[0,1],ymm4[0,1] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm5[2,3],ymm4[2,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm5[0,1],ymm4[0,1] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm7, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm6[1],ymm13[2,3,4,5],ymm6[6],ymm13[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm15, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4,5],ymm3[6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0],ymm11[1],mem[2,3,4,5],ymm11[6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0],xmm13[1],xmm1[2,3],xmm13[4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm10[1],ymm8[2,3,4,5],ymm10[6],ymm8[7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm2[1],xmm13[2,3],xmm2[4],xmm13[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm3 -; AVX2-SLOW-NEXT: vpshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = mem[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm7, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm7 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm15[1],ymm11[2,3,4,5],ymm15[6],ymm11[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm11 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm6[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm11[0],xmm14[1],xmm11[2,3],xmm14[4],xmm11[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[0],ymm11[1],mem[2,3,4,5],ymm11[6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm9[0],xmm13[1],xmm9[2,3],xmm13[4],xmm9[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm7, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,2,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm6, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7607,181 +7625,184 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] +; AVX2-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX2-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm11[0],mem[1],ymm11[2,3,4,5],mem[6],ymm11[7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7],ymm1[8,9,10],ymm5[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm12 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[0,1,2,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3],xmm14[4,5],xmm15[6],xmm14[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3],xmm15[4,5],xmm13[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm8[1],ymm10[2,3,4,5],ymm8[6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm8, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3],xmm10[4,5],xmm11[6],xmm10[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm7 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3],xmm7[4,5],xmm12[6],xmm7[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm12, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,0,3] +; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm5 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,2,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2],xmm0[3],xmm9[4,5],xmm0[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[0],ymm9[1],mem[2,3,4,5],ymm9[6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2],ymm7[3,4,5,6,7],ymm0[8,9,10],ymm7[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1],xmm9[2],xmm5[3],xmm9[4,5],xmm5[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm5, %ymm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm15[3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7],ymm9[8,9,10],ymm8[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm13, %xmm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0,1],xmm15[2],xmm13[3],xmm15[4,5],xmm13[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm15[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm11[3],xmm0[4,5],xmm11[6],xmm0[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7],ymm14[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm0[2],xmm10[3],xmm0[4,5],xmm10[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3],xmm10[4,5],xmm7[6],xmm10[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2],ymm7[3,4,5,6,7],ymm0[8,9,10],ymm7[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm8, %xmm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm5, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] @@ -7790,303 +7811,305 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6],xmm5[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,3,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1,2],xmm12[3],xmm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm12[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm10[0,0,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm15[4],xmm13[5,6],xmm15[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,1,2,3] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm15[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm11[1,2],xmm6[3],xmm11[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm13[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[0,0,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm13[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1,2],xmm8[3],xmm5[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm11[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7],ymm9[8,9,10],ymm8[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm14[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1,2],xmm13[3],xmm12[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm9, %ymm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0,1,2],ymm4[3,4,5,6,7],ymm13[8,9,10],ymm4[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,5,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm14[4],xmm4[5,6],xmm14[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[2,1,2,3] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm15[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1,2],xmm11[3],xmm10[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm14, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm8 = mem[1,1,1,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5,6],xmm5[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm0[4],xmm8[5,6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[3,1,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1,2],xmm7[3],xmm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm7[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5,6],xmm5[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1,2],xmm1[3],xmm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0,1,2],ymm5[3,4,5,6,7],ymm3[8,9,10],ymm5[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX2-SLOW-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2],xmm3[3],xmm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5,6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm5 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm8[4],xmm3[5],xmm8[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5,6],mem[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm10[4],xmm1[5],xmm10[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[0,1,0,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5,6],mem[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5],xmm14[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[0,1,2,3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5,6],mem[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm5 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm11[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1,2,3],xmm5[4],xmm9[5],xmm5[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2,3],xmm7[4],xmm10[5],xmm7[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm7 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm7[4],xmm11[5],xmm7[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4],xmm9[5],xmm6[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm14[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1,2,3],xmm6[4],xmm11[5],xmm6[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 32(%r8) +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 96(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm13, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 96(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rax) -; AVX2-SLOW-NEXT: addq $1256, %rsp # imm = 0x4E8 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, (%rax) +; AVX2-SLOW-NEXT: addq $1272, %rsp # imm = 0x4F8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1272, %rsp # imm = 0x4F8 +; AVX2-FAST-NEXT: subq $1256, %rsp # imm = 0x4E8 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm2 @@ -8095,288 +8118,281 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm7 -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm7[2,3],ymm6[2,3] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm5[2,3],ymm4[2,3] -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm5[0,1],ymm4[0,1] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1] ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm7[1],ymm11[2,3,4,5],ymm7[6],ymm11[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm2 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4,5],ymm3[6],ymm5[7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm6 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm8 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm13 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0],xmm13[1],xmm1[2,3],xmm13[4],xmm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3],xmm13[4],xmm15[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm13, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm13 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm0 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3],xmm8[4,5],xmm0[6,7] -; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm14[0],mem[1],ymm14[2,3,4,5],mem[6],ymm14[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm0 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm14 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3],xmm14[4,5],xmm0[6],xmm14[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7],ymm15[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm14 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,1,0,3] -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm15 -; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm13[2],xmm15[3],xmm13[4,5],xmm15[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm6 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3],xmm11[4,5],xmm0[6],xmm11[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2],xmm0[3],xmm6[4,5],xmm0[6],xmm6[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3],xmm8[4,5],xmm1[6,7] +; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm6 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1],xmm11[2],xmm6[3],xmm11[4,5],xmm6[6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7],ymm11[8,9,10],ymm7[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm14 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3],xmm14[4,5],xmm7[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm7 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm15[2],xmm7[3],xmm15[4,5],xmm7[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufhw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = mem[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7],ymm7[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7],ymm14[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm1 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3],xmm8[4,5],xmm0[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm11, %xmm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3],xmm4[4,5],xmm0[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] @@ -8387,256 +8403,253 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm10 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm6 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,1,0,3] -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm2 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm3[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1,2,3],xmm12[4],xmm2[5,6],xmm12[7] +; AVX2-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, (%rsp), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[0,3,2,1] -; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm13 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0,1,2],ymm12[3,4,5,6,7],ymm15[8,9,10],ymm12[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm15[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[2,1,0,3] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7] +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm3 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2],xmm5[3],xmm0[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,2,1] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm3 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1,2],xmm9[3],xmm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7],ymm9[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm9[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm9 = mem[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm9[4],xmm1[5,6],xmm9[7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1,2],xmm4[3],xmm7[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm1 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2],xmm6[3],xmm1[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm4 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm6 -; AVX2-FAST-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm11 = mem[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1,2],xmm11[3],xmm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm11[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm5 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2],xmm3[3],xmm5[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX2-FAST-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = mem[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm1 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,1] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm3[4],xmm9[5],xmm3[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,3,2,1] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm14 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5],xmm14[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm14, %xmm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5],xmm10[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4],xmm10[5],xmm5[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -8665,28 +8678,28 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 96(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 96(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm4, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 96(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm14, 32(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm11, (%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 32(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 64(%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, (%rax) -; AVX2-FAST-NEXT: addq $1272, %rsp # imm = 0x4F8 +; AVX2-FAST-NEXT: vmovdqa %ymm7, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm12, (%rax) +; AVX2-FAST-NEXT: addq $1256, %rsp # imm = 0x4E8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride6_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1272, %rsp # imm = 0x4F8 +; AVX2-FAST-PERLANE-NEXT: subq $1256, %rsp # imm = 0x4E8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm2 @@ -8695,548 +8708,538 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm7[2,3],ymm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm5[2,3],ymm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm5[0,1],ymm4[0,1] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1] ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm7[1],ymm11[2,3,4,5],ymm7[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4,5],ymm3[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0],xmm13[1],xmm1[2,3],xmm13[4],xmm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3],xmm13[4],xmm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm13, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm7, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm8, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3],xmm8[4,5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm14[0],mem[1],ymm14[2,3,4,5],mem[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm0 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3],xmm14[4,5],xmm0[6],xmm14[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7],ymm15[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm13[2],xmm15[3],xmm13[4,5],xmm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm6, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3],xmm11[4,5],xmm0[6],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2],xmm0[3],xmm6[4,5],xmm0[6],xmm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3],xmm8[4,5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1],xmm11[2],xmm6[3],xmm11[4,5],xmm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm8, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm6, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7],ymm11[8,9,10],ymm7[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3],xmm14[4,5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm15[2],xmm7[3],xmm15[4,5],xmm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,2,3,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7],ymm7[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7],ymm14[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3],xmm8[4,5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm4, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3],xmm4[4,5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm11, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm13, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm8, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm0 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm12 = xmm3[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1,2,3],xmm12[4],xmm2[5,6],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, (%rsp), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0,1,2],ymm12[3,4,5,6,7],ymm15[8,9,10],ymm12[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm15, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm6, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2],xmm5[3],xmm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm14, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm13, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm12, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm14, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1,2],xmm9[3],xmm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7],ymm9[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm9[4],xmm1[5,6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1,2],xmm4[3],xmm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2],xmm6[3],xmm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm14, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm13, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1,2],xmm11[3],xmm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2],xmm3[3],xmm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm3[4],xmm9[5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm13, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm12, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5],xmm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm14, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm15, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5],xmm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4],xmm10[5],xmm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm15, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -9265,35 +9268,35 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 96(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 64(%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $1272, %rsp # imm = 0x4F8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $1256, %rsp # imm = 0x4E8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride6_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $1416, %rsp # imm = 0x588 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: subq $1480, %rsp # imm = 0x5C8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,0,3] @@ -9304,12 +9307,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -9320,9 +9323,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,0,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm22 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] @@ -9345,7 +9348,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm1 @@ -9367,120 +9370,120 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,2,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm30 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm6, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,2,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm11[0,1,2],xmm13[3],xmm11[4,5],xmm13[6],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm11[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm11, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2],xmm12[3],xmm1[4,5],xmm12[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm15, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm15, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm14, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2],xmm14[3],xmm0[4,5],xmm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm10, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm11, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm10, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -9489,8 +9492,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm26 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] @@ -9500,9 +9503,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] @@ -9513,63 +9516,63 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,6,4] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm13 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm24 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -9594,8 +9597,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -9612,15 +9615,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm29, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm24, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm22, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm2 @@ -9646,12 +9650,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[1,1,1,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2,3],xmm2[4],xmm12[5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm13, %ymm13 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5] @@ -9664,11 +9668,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm25 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm28, %zmm29, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm24, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5] @@ -9684,7 +9688,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm5, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] @@ -9695,226 +9699,228 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm24, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm22, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm24 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6],ymm4[7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm19 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm4, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm4, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: movw $31, %ax ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm6, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm6, %xmm27 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm20 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm6, %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm14 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm17 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm3 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,0,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm12, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm31 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm31 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm7 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm7, %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm10[4],xmm1[5],xmm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm7, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm1[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0],xmm10[1],xmm2[2,3],xmm10[4],xmm2[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm24[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm19[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3],xmm3[4],xmm13[5],xmm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm26[1,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3],xmm10[4],xmm3[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm13, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm13, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2,3],xmm8[4],xmm13[5],xmm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3],xmm3[4],xmm13[5],xmm3[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm10 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm8, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3],xmm6[4],xmm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm3, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm11[4],xmm9[5],xmm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5],xmm9[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5],xmm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rsi) ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rdx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm4, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 64(%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, (%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 64(%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $1416, %rsp # imm = 0x588 +; AVX512F-ONLY-SLOW-NEXT: addq $1480, %rsp # imm = 0x5C8 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i16_stride6_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1512, %rsp # imm = 0x5E8 +; AVX512F-ONLY-FAST-NEXT: subq $1480, %rsp # imm = 0x5C8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> ; AVX512F-ONLY-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9922,11 +9928,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 544(%rdi), %ymm1 @@ -9937,10 +9943,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9951,42 +9957,42 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 480(%rdi), %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm24 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm26 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -9995,21 +10001,21 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10017,47 +10023,47 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3],xmm4[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm1[1],ymm4[2,3,4,5],ymm1[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2],xmm9[3],xmm2[4,5],xmm9[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1,2],xmm9[3],xmm3[4,5],xmm9[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 @@ -10066,401 +10072,401 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3],xmm5[4,5],xmm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm26, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm18 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm17 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5,6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $107, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm16 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = ymm6[0,1],mem[2],ymm6[3],mem[4],ymm6[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm4, %zmm29, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2],xmm4[3],xmm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3],xmm15[4],xmm1[5,6],xmm15[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm11[0,1,2],ymm15[3,4,5,6,7],ymm11[8,9,10],ymm15[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm23, %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm0[1,2],xmm11[3],xmm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm11, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm14, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm11, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5,6],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm11, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm13, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2],xmm13[3],xmm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm11[1,2],xmm13[3],xmm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm0[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm11[4],xmm13[5,6],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3,4,5,6,7],ymm14[8,9,10],ymm11[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6],xmm14[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7],ymm15[8,9,10],ymm13[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm26 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm24, %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[3,1,2,1,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1,2],xmm10[3],xmm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5,6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm8 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm25, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm22, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm25 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm3, %ymm2 ; AVX512F-ONLY-FAST-NEXT: movw $31, %ax ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $66, (%rsp), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5],xmm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm31 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm3 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm7 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm10 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm8[4],xmm1[5],xmm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4],xmm1[5],xmm7[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm7 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm1[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1],xmm11[2,3],xmm8[4],xmm11[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm11, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm11, %ymm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm15 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4],xmm15[5],xmm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4],xmm8[5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm9, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2,3],xmm1[4],xmm9[5],xmm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm15 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm1[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 @@ -10468,26 +10474,26 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm14 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm15, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm14 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1,2,3],xmm5[4],xmm9[5],xmm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4],xmm7[5],xmm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 @@ -10515,8 +10521,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm30, %zmm4, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm31, %zmm4, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm30, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm31, %zmm4, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm4, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm4, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -10525,12 +10531,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, (%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%r8) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1512, %rsp # imm = 0x5E8 +; AVX512F-ONLY-FAST-NEXT: addq $1480, %rsp # imm = 0x5C8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; @@ -10555,11 +10561,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -10571,8 +10577,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm2 -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm21 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm21[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] @@ -10592,7 +10599,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm19 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm2 @@ -10628,51 +10635,51 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm8 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm3, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm10[0,1,2],xmm12[3],xmm10[4,5],xmm12[6],xmm10[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm10 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm10[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm10, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2],xmm12[3],xmm1[4,5],xmm12[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] @@ -10692,20 +10699,20 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm3 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm5[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm7, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm5 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm5 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10716,20 +10723,20 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,5,5,5,5] @@ -10742,20 +10749,20 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm21 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm22 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] @@ -10766,41 +10773,41 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm0[2],ymm3[3],ymm0[4],ymm3[5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm3[2],ymm0[3],ymm3[4],ymm0[5,6],ymm3[7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,6,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,6,4] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> ; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 @@ -10808,25 +10815,25 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm21, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm20, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,1,2,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7] @@ -10837,8 +10844,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm19 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] @@ -10847,8 +10854,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -10872,11 +10879,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm28 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm21, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm20, %zmm2 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm28 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm21, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] @@ -10887,7 +10894,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm20 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm21 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 @@ -10899,21 +10906,21 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[1,1,1,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2,3],xmm2[4],xmm12[5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm13 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm13, %ymm13 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm27 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm20, %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm21, %zmm20, %zmm0 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm27 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5] @@ -10929,7 +10936,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm5, %ymm5 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] @@ -10938,178 +10945,178 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm20 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm22 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm20, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm22 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm5, %xmm4 +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm30 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm30 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm21, %ymm4, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm20, %ymm3, %ymm2 ; AVX512DQ-SLOW-NEXT: movw $31, %ax ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm22 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm4[1],ymm0[2,3,4,5],ymm4[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm6, %xmm4 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm23 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm29 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm6, %xmm16 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm21 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm17 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm14 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm15 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm12 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm21, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm6 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm5 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm25 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm4, %zmm0, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm2 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm15, %ymm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm20, %ymm1, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3],xmm5[4],xmm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm25 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm25 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm7 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm11 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm11[1],xmm3[2,3],xmm11[4],xmm3[5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm12 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm12 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm12[1],xmm4[2,3],xmm12[4],xmm4[5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3],xmm0[4],xmm14[5],xmm0[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm26 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm26 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm30[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm13 -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm21, %ymm13, %ymm8 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm8, %zmm0, %zmm26 {%k1} +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm14 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm20, %ymm14, %ymm10 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm10, %zmm0, %zmm26 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm10 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2,3],xmm8[4],xmm13[5],xmm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0,1,2,3],xmm10[4],xmm14[5],xmm10[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm14, %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm15, %ymm10 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[1,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm21, %ymm8, %ymm11 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm11, %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm5, %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm12[1],xmm3[2,3],xmm12[4],xmm3[5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm20, %ymm10, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm7, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%rsi) @@ -11120,16 +11127,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%rdx) ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm3, %zmm24 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm25, %zmm3, %zmm23 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm3, %zmm24 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm25, %zmm3, %zmm21 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm26, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 64(%r9) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) @@ -11140,7 +11147,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-FAST-LABEL: load_i16_stride6_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $936, %rsp # imm = 0x3A8 +; AVX512DQ-FAST-NEXT: subq $904, %rsp # imm = 0x388 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> ; AVX512DQ-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11148,11 +11155,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa 544(%rdi), %ymm1 @@ -11160,61 +11167,61 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm1 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] ; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],mem[2,3] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 480(%rdi), %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 480(%rdi), %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm3[3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm1 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm3 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm2[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm3[2,3],mem[2,3] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm5[1],ymm2[2,3,4,5],ymm5[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm5[1],ymm3[2,3,4,5],ymm5[6],ymm3[7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm27 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm26 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm18 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm4 ; AVX512DQ-FAST-NEXT: movw $-2048, %ax # imm = 0xF800 @@ -11225,115 +11232,115 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm1 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm0 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm10 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm13 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3],xmm4[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm1[1],ymm4[2,3,4,5],ymm1[6],ymm4[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm3[3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm13 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0,1,2],xmm13[3],xmm3[4,5],xmm13[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm10 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0,1,2],xmm10[3],xmm2[4,5],xmm10[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm6 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm0 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm6 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3],xmm6[4,5],xmm2[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3],xmm6[4,5],xmm4[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2],ymm7[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm16, %zmm7 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm16, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm4 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm3 ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,5,5,5] @@ -11349,362 +11356,361 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,1,2,0,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm19 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm23 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm25 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm23 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $219, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm18 +; AVX512DQ-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm25 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm17 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5,6],xmm6[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm18 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1],ymm7[2],ymm6[3],ymm7[4],ymm6[5,6],ymm7[7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm4, %zmm21, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm4, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm4 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2],xmm4[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm4 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6],xmm5[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm19 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm17 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm20, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm16 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3],xmm15[4],xmm1[5,6],xmm15[7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm11 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm11[0,1,2],ymm15[3,4,5,6,7],ymm11[8,9,10],ymm15[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm19, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm0[1,2],xmm11[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm15 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm29 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm9, %zmm20, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5,6],xmm10[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm9, %zmm21 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm9 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0],xmm9[1,2],xmm15[3],xmm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm0[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,7,5,6,5] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm14, %zmm20 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm11, %xmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2],xmm13[3],xmm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm0[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm11 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,4,5] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm27 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm21, %zmm20, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm1 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm11[4],xmm13[5,6],xmm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm14 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3,4,5,6,7],ymm14[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,4,5] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm29 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm20, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1,2],xmm10[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5,6],xmm9[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm5 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm27, %zmm20 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5,6],xmm11[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3],xmm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm20, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm21 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm15 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm13 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1] ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm26 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm28 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm12 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm10 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm18 -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm20, %ymm3, %ymm2 ; AVX512DQ-FAST-NEXT: movw $31, %ax ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $146, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm0 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm27 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm30 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm21 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm14 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm31 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm6 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,2,2,2,4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm17 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm16 -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm12 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5],xmm2[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm24 +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm20, %ymm1, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm24 ; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm3, %zmm0, %zmm24 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512DQ-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 +; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm12 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm11, %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm15 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1,2,3],xmm4[4],xmm15[5],xmm4[6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm20, %ymm12, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm12 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm13 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1,2,3],xmm4[4],xmm14[5],xmm4[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm26 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm9, %zmm0, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm11, %zmm0, %zmm26 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm9 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2,3],xmm4[4],xmm9[5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0,1,2,3],xmm4[4],xmm11[5],xmm4[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm14 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm14 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm0 ; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm15, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3],xmm1[4],xmm6[5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm15 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3],xmm0[4],xmm15[5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm20, %ymm14, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 ; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 @@ -11721,18 +11727,17 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm24, %zmm2, %zmm25 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm26, %zmm2, %zmm4 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 64(%rcx) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm1, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, (%r8) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 64(%r9) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-FAST-NEXT: addq $936, %rsp # imm = 0x3A8 +; AVX512DQ-FAST-NEXT: addq $904, %rsp # imm = 0x388 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 8dbd67be7769e..19530e02a99af 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -301,77 +301,77 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i16_stride7_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,3,2,3] +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: psrlq $16, %xmm10 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[1,1,1,1] -; SSE-NEXT: pslld $16, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: psrlq $16, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,1,1] +; SSE-NEXT: pslld $16, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: psrlq $48, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: psrld $16, %xmm10 +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movq %xmm4, (%rsi) -; SSE-NEXT: movq %xmm1, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movq %xmm2, (%rsi) +; SSE-NEXT: movq %xmm0, (%rdx) ; SSE-NEXT: movq %xmm7, (%rcx) ; SSE-NEXT: movq %xmm8, (%r8) -; SSE-NEXT: movq %xmm5, (%r9) -; SSE-NEXT: movq %xmm9, (%rdi) -; SSE-NEXT: movq %xmm0, (%rax) +; SSE-NEXT: movq %xmm6, (%r9) +; SSE-NEXT: movq %xmm10, (%rdi) +; SSE-NEXT: movq %xmm1, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride7_vf4: @@ -734,183 +734,167 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i16_stride7_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdi), %xmm4 -; SSE-NEXT: movaps 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 80(%rdi), %xmm9 -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,2],xmm6[2,2] -; SSE-NEXT: movaps {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm7, %xmm12 -; SSE-NEXT: andnps %xmm8, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[0,1,0,3] -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm13[2],xmm8[3],xmm13[3] -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm12, %xmm8 -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: por %xmm11, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: psrld $16, %xmm11 -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: por %xmm12, %xmm13 -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: pandn %xmm13, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm9 +; SSE-NEXT: movaps 32(%rdi), %xmm2 +; SSE-NEXT: movaps 48(%rdi), %xmm8 +; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa 96(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,2],xmm8[2,2] +; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm4, %xmm12 +; SSE-NEXT: andnps %xmm5, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm9[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm12, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,1,0,1] +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pand %xmm11, %xmm13 -; SSE-NEXT: por %xmm12, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm13[0,3,2,3] +; SSE-NEXT: pandn %xmm10, %xmm12 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: psrld $16, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] +; SSE-NEXT: pand %xmm11, %xmm15 +; SSE-NEXT: por %xmm12, %xmm15 +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm15, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm15 +; SSE-NEXT: por %xmm12, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm15[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm12 -; SSE-NEXT: movaps %xmm3, %xmm13 -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,7,7,7] -; SSE-NEXT: pandn %xmm13, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: por %xmm12, %xmm7 -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: por %xmm15, %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,1,2,1] +; SSE-NEXT: pand %xmm4, %xmm12 +; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,7,7,7] +; SSE-NEXT: pandn %xmm15, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; SSE-NEXT: por %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm13, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm14, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm10 = xmm14[0],xmm10[1,2,3] -; SSE-NEXT: andps %xmm5, %xmm10 -; SSE-NEXT: orps %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,1] ; SSE-NEXT: pand %xmm11, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm14, %xmm11 +; SSE-NEXT: por %xmm15, %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: pandn %xmm11, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm15[0],xmm11[1,2,3] +; SSE-NEXT: andps %xmm3, %xmm11 +; SSE-NEXT: orps %xmm14, %xmm11 +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm15 +; SSE-NEXT: por %xmm14, %xmm15 +; SSE-NEXT: movdqa %xmm0, %xmm14 ; SSE-NEXT: psrld $16, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm12[0],xmm2[1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE-NEXT: andps %xmm5, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm15 = xmm12[0],xmm15[1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: andps %xmm3, %xmm15 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,4,7] -; SSE-NEXT: pandn %xmm13, %xmm5 -; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: pandn %xmm13, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm13 ; SSE-NEXT: psrlq $16, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] -; SSE-NEXT: psrlq $48, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,0,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; SSE-NEXT: por %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm15, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; SSE-NEXT: psrlq $48, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm12[0],xmm2[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm15[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm11[0,2] -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movdqa %xmm8, (%rsi) -; SSE-NEXT: movdqa %xmm7, (%rdx) -; SSE-NEXT: movaps %xmm10, (%rcx) -; SSE-NEXT: movdqa %xmm5, (%r8) -; SSE-NEXT: movapd %xmm2, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm15[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm10[0,2] +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movdqa %xmm5, (%rsi) +; SSE-NEXT: movdqa %xmm4, (%rdx) +; SSE-NEXT: movaps %xmm11, (%rcx) +; SSE-NEXT: movdqa %xmm3, (%r8) +; SSE-NEXT: movapd %xmm13, (%r9) ; SSE-NEXT: movaps %xmm14, (%rdi) -; SSE-NEXT: movapd %xmm0, (%rax) +; SSE-NEXT: movapd %xmm1, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride7_vf8: @@ -1473,339 +1457,339 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i16_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $216, %rsp +; SSE-NEXT: subq $232, %rsp ; SSE-NEXT: movdqa 80(%rdi), %xmm11 -; SSE-NEXT: movdqa 64(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm6 -; SSE-NEXT: movdqa 128(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm12 +; SSE-NEXT: movdqa 128(%rdi), %xmm6 ; SSE-NEXT: movaps 160(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm13 ; SSE-NEXT: movdqa 176(%rdi), %xmm15 -; SSE-NEXT: movdqa 208(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm3, %xmm10 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm5[2,2] -; SSE-NEXT: movaps {{.*#+}} xmm12 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm12, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,0,3] +; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: andnps %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa 96(%rdi), %xmm5 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,0,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm2 ; SSE-NEXT: movaps 32(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm12, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: andnps %xmm0, %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm14, %xmm2 +; SSE-NEXT: psrld $16, %xmm13 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] ; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm14 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: pand %xmm6, %xmm8 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: movdqa (%rsp), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm12, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: pandn %xmm4, %xmm12 -; SSE-NEXT: por %xmm3, %xmm12 -; SSE-NEXT: pand %xmm10, %xmm12 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,1] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: pand %xmm14, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,1] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] -; SSE-NEXT: andps %xmm10, %xmm5 -; SSE-NEXT: orps %xmm3, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,1,0,1] +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm11 +; SSE-NEXT: orps %xmm4, %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,1] ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm15 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm11[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm10, %xmm0 -; SSE-NEXT: orps %xmm3, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: andps %xmm10, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,0,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm11[0],xmm4[1,2,3] +; SSE-NEXT: andps %xmm15, %xmm4 +; SSE-NEXT: orps %xmm0, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7] +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: andnps %xmm5, %xmm2 +; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: andps %xmm15, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: psrld $16, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: andnps %xmm1, %xmm15 +; SSE-NEXT: orps %xmm0, %xmm15 +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: psrld $16, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: psrlq $16, %xmm0 +; SSE-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: movdqa %xmm10, %xmm15 ; SSE-NEXT: psrld $16, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,2] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[0,2] +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm9, %xmm7 +; SSE-NEXT: por %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] @@ -1814,59 +1798,59 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm2 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm2[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm14[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movapd %xmm9, (%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r9) +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm9[0],xmm11[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%r8) +; SSE-NEXT: movapd %xmm1, (%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm15, (%rax) -; SSE-NEXT: movaps %xmm11, 16(%rax) +; SSE-NEXT: movaps %xmm14, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm13, (%rax) -; SSE-NEXT: movapd %xmm8, 16(%rax) -; SSE-NEXT: addq $216, %rsp +; SSE-NEXT: movapd %xmm11, (%rax) +; SSE-NEXT: movapd %xmm0, 16(%rax) +; SSE-NEXT: addq $232, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride7_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $248, %rsp +; AVX1-ONLY-NEXT: subq $264, %rsp # imm = 0x108 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm0 @@ -1876,294 +1860,291 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,5,6],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,5,6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm11 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm10 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm5[2],xmm2[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm1[2],xmm2[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm13 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0,1,2],xmm10[3,4],xmm15[5,6,7] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2],xmm9[3,4],xmm15[5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm15, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm15, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm15, %ymm11 +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm11, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm4 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,5],xmm6[6],xmm3[7] +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,2,3,4,5],xmm8[6],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,0,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm14[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm11[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm9 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm10 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm10[7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm11[0],xmm0[1],xmm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0],xmm1[1],xmm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0,1,2],xmm10[3,4],xmm15[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm9, %ymm10, %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vandps %ymm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsllq $16, %xmm4, %xmm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1],xmm10[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm10 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0,1,2,3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm11[3,4],xmm15[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm6, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm9, %ymm15, %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsllq $16, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm11 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm10[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm12[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm5, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2,3,4,5],xmm0[6],xmm13[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,0,0,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1,2],xmm9[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm12[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0],xmm6[1],xmm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,4,5,8,9,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5],xmm5[6],xmm15[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0],xmm7[1],xmm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,7,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,4,5,8,9,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm2, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm14 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm1[1,2],xmm10[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,0,1,4,5,8,9,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm2, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm10, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm10 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3,4],xmm9[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,5],xmm6[6],xmm4[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1,2],xmm9[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm13, %xmm9 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,4,5,8,9,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,5],xmm2[6],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm11[1],xmm12[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,7,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm9[5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm8[1],xmm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,0,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,4,7,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,2] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm10 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1],xmm14[1],zero -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1],mem[0],zero +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm4[1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm6[1],xmm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[1,1,1,1] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $248, %rsp +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: addq $264, %rsp # imm = 0x108 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2349,16 +2330,16 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [3,6,2,5,3,6,2,5] ; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] @@ -2368,8 +2349,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] @@ -2380,7 +2361,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] @@ -2388,8 +2369,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7] @@ -2399,7 +2380,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm13, %ymm12 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] @@ -2410,7 +2391,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm12[1,2,3,4,5,6,7],ymm10[8],ymm12[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] @@ -2435,30 +2416,30 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,2,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm15[1],xmm6[2],xmm15[3],xmm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,3,7,2,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm15[1],xmm9[2],xmm15[3],xmm9[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1,2,3,4,5,6,7],ymm9[8],ymm6[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1,2,3,4,5,6,7],ymm5[8],ymm9[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [2,5,1,4,2,5,1,4] ; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [0,3,7,0,0,3,7,0] ; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm14 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5,6,7],ymm6[8,9,10,11,12],ymm14[13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5,6,7],ymm5[8,9,10,11,12],ymm14[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,4,7,3,6,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm14 @@ -2468,19 +2449,19 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0],ymm6[1,2,3,4,5,6,7],ymm12[8],ymm6[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm5[1,2,3,4,5,6,7],ymm12[8],ymm5[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm12[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,4,7,0,0,4,7,0] ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,1,5,2,6,1,5] -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,6,1,5,2,6,1,5] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7],ymm4[8,9,10,11,12],ymm6[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,4,0,3,7,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 @@ -2501,7 +2482,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa %ymm11, (%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm6, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-NEXT: vzeroupper @@ -2523,30 +2504,30 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm10[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm8 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm7, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm11[4],xmm9[5],xmm11[6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5],xmm11[6],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3,4,5],xmm9[6],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1],xmm11[2,3,4,5],xmm8[6],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7,8,9,10],ymm12[11],ymm11[12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm12, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] @@ -2554,18 +2535,18 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3,4,5,6,7],ymm9[8],ymm7[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2,3,4,5],xmm12[6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1],xmm8[2,3,4,5],xmm12[6],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4],ymm13[5,6,7,8,9,10,11],ymm12[12],ymm13[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm12, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7] @@ -2574,8 +2555,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm10[0,1,1,2] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm12[1,2,3,4,5,6,7],ymm9[8],ymm12[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm12[1,2,3,4,5,6,7],ymm8[8],ymm12[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] @@ -2610,10 +2591,10 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2],xmm15[3],xmm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm12, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1,2,3,4,5,6,7],ymm14[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1],xmm4[2],xmm12[3],xmm4[4],xmm12[5,6,7] @@ -2636,16 +2617,16 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm14[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1,2,3,4,5,6,7],ymm11[8],ymm4[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4],xmm5[5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,2,3,0,1,14,15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,2,3,0,1,14,15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] @@ -2655,18 +2636,18 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm5[1,2,3,4,5,6,7],ymm1[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -2807,17 +2788,17 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm11[1,2,3,4,5,6,7],ymm4[8],ymm11[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4],xmm5[5],xmm6[6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] @@ -2829,7 +2810,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm5[1,2,3,4,5,6,7],ymm0[8],ymm5[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa %ymm7, (%rdx) @@ -2837,7 +2818,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa %ymm9, (%r8) ; AVX512F-SLOW-NEXT: vmovdqa %ymm10, (%r9) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa %ymm4, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm11, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper @@ -2849,9 +2830,9 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] ; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [10,3,6,15,12,13,6,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,3,6,15,12,13,6,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] @@ -2861,8 +2842,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] @@ -2900,14 +2881,14 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm10, %ymm14, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0],ymm13[1,2,3,4,5,6,7],ymm10[8],ymm13[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm8, %ymm14, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2,3,4,5,6,7],ymm8[8],ymm13[9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] @@ -2915,65 +2896,65 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2],xmm15[3],xmm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,3,7,10,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1,2,3,4,5,6,7],ymm7[8],ymm13[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,10,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1,2,3,4,5,6,7],ymm7[8],ymm15[9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm9[1,2,3,4,5,6,7],ymm8[8],ymm9[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,3,u,0,3,7,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7],ymm11[8,9,10,11,12],ymm9[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,3,u,0,3,7,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u> ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] ; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,4,7,0,0,4,7,0] -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] @@ -2984,11 +2965,11 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, (%rsi) ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm10, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, (%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm9, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm9, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper @@ -3000,9 +2981,9 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] ; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [10,3,6,15,12,13,6,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,3,6,15,12,13,6,15] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] @@ -3012,8 +2993,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm5 +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm5 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] @@ -3051,14 +3032,14 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm10, %ymm14, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0],ymm13[1,2,3,4,5,6,7],ymm10[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm14, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2,3,4,5,6,7],ymm8[8],ymm13[9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] @@ -3066,65 +3047,65 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2],xmm15[3],xmm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,3,7,10,14,u,u,u> -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1,2,3,4,5,6,7],ymm7[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8 +; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,10,14,u,u,u> +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1,2,3,4,5,6,7],ymm7[8],ymm15[9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm13 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm9[1,2,3,4,5,6,7],ymm8[8],ymm9[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,3,u,0,3,7,u> -; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7],ymm11[8,9,10,11,12],ymm9[13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm14 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,3,u,0,3,7,u> +; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u> ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] ; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm12 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm13 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] +; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] @@ -3135,11 +3116,11 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, (%rsi) ; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm10, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, (%rcx) ; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, (%r9) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-FAST-NEXT: vzeroupper @@ -3232,136 +3213,137 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i16_stride7_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $600, %rsp # imm = 0x258 -; SSE-NEXT: movdqa 304(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm10 -; SSE-NEXT: movdqa 128(%rdi), %xmm11 -; SSE-NEXT: movaps 160(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm4 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 304(%rdi), %xmm5 +; SSE-NEXT: movdqa 288(%rdi), %xmm6 +; SSE-NEXT: movdqa 112(%rdi), %xmm13 +; SSE-NEXT: movdqa 128(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm7 +; SSE-NEXT: movaps 144(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm9 +; SSE-NEXT: movdqa 176(%rdi), %xmm12 ; SSE-NEXT: movdqa 208(%rdi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm6[2,2] -; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm7[2,2] +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{.*#+}} xmm14 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm14, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,0,3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,0,3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa 320(%rdi), %xmm5 -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa 320(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 272(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps 272(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] +; SSE-NEXT: movaps %xmm14, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa 224(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 240(%rdi), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 416(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 416(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 400(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movaps 384(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movaps 384(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] +; SSE-NEXT: movaps %xmm14, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 ; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 352(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: movdqa 352(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 80(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: movaps 48(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,2] +; SSE-NEXT: movaps %xmm14, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3371,313 +3353,311 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm4, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pand %xmm15, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: orps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm4[0],xmm8[1,2,3] +; SSE-NEXT: andps %xmm15, %xmm8 +; SSE-NEXT: orps %xmm1, %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm14 = xmm14[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm5[0],xmm6[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm6 -; SSE-NEXT: orps %xmm4, %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm14 = xmm5[0],xmm14[1,2,3] +; SSE-NEXT: andps %xmm15, %xmm14 +; SSE-NEXT: orps %xmm4, %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: orps %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1] -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,0,1] -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,1,0,3] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] +; SSE-NEXT: andps %xmm15, %xmm3 +; SSE-NEXT: orps %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,1,0,1] +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3] ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: orps %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm7[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: orps %xmm4, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm9, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,6,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pand %xmm11, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm0[0],xmm5[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: andps %xmm14, %xmm5 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: andps %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -3685,60 +3665,89 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: andps %xmm15, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: psrlq $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -3746,27 +3755,28 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: psrld $16, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: psrld $16, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: punpcklwd (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd $196, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] @@ -3778,271 +3788,241 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: psrld $16, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: psrlq $16, %xmm4 -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,6,4,7] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm1 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: psrlq $16, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm11, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pand %xmm11, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,2] -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,6,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3] +; SSE-NEXT: pandn (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm11, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[0,2] +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm12, %xmm11 +; SSE-NEXT: por %xmm2, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: movdqa %xmm3, %xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: psrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm15[0],xmm4[1] -; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm15[0],xmm3[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,1,1] -; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,1,0,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%r9) +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movdqa %xmm15, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm5, (%rax) -; SSE-NEXT: movaps %xmm6, 48(%rax) -; SSE-NEXT: movaps %xmm10, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rax) +; SSE-NEXT: movaps %xmm6, (%rax) +; SSE-NEXT: movaps %xmm7, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, (%rax) -; SSE-NEXT: movapd %xmm1, 48(%rax) -; SSE-NEXT: movapd %xmm3, 32(%rax) -; SSE-NEXT: movapd %xmm4, 16(%rax) +; SSE-NEXT: movapd %xmm3, 48(%rax) +; SSE-NEXT: movapd %xmm4, 32(%rax) +; SSE-NEXT: movapd %xmm5, 16(%rax) ; SSE-NEXT: addq $600, %rsp # imm = 0x258 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride7_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4054,16 +4034,17 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 @@ -4074,12 +4055,11 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm4[2],xmm10[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm6[2],xmm7[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 @@ -4092,12 +4072,12 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] @@ -4114,22 +4094,21 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm5[2],xmm8[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm5[2],xmm11[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] @@ -4145,662 +4124,673 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm15[6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5],xmm10[6],xmm15[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm1 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm6[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm13[0],mem[1],xmm13[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm10[0,1,2,3,4,5],mem[6],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm8[0],mem[1],xmm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,5],xmm7[6],xmm9[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslld $16, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm11[1],xmm14[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsllq $16, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1],xmm3[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm4[7] ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm0[1],xmm5[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsllq $16, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm3[1],xmm5[1] +; AVX1-ONLY-NEXT: vpsllq $16, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,2,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm1[1],xmm5[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,5,6],xmm14[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm8, %ymm14 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm3, %ymm14 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm14, %ymm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm10, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2,3,4,5],mem[6],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3,4,5],xmm3[6],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm4[1,2],xmm5[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm9 -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm7[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm10[1],xmm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,4,5,6,7,8,9,4,5,8,9,2,3] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm15, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm10, %ymm13 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm6, %ymm13 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, (%rsp), %xmm2, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2,3,4,5],xmm2[6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm6[0,1,2,3,4,5],mem[6],xmm6[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,0,0,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,6,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm0[1,2],xmm13[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm8[0],mem[1],xmm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm12[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,0,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm12[0,1,2,3,4,5],mem[6],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,5],xmm11[6],xmm13[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm14[1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0],xmm14[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,5],xmm0[6],xmm7[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm9[6],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm11[1],xmm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,xmm10[1],mem[0],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm15[1],xmm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm12[1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = zero,xmm3[1],mem[0],zero -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm9[0],mem[1],xmm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = zero,xmm2[1],mem[0],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm13[1],xmm11[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero +; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX1-ONLY-NEXT: addq $680, %rsp # imm = 0x2A8 -; AVX1-ONLY-NEXT: vzeroupper -; AVX1-ONLY-NEXT: retq -; -; AVX2-SLOW-LABEL: load_i16_stride7_vf32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = zero,xmm4[1],mem[0],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm7[1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX1-ONLY-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX1-ONLY-NEXT: vzeroupper +; AVX1-ONLY-NEXT: retq +; +; AVX2-SLOW-LABEL: load_i16_stride7_vf32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: subq $520, %rsp # imm = 0x208 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm4[2],ymm12[3,4,5],ymm4[6],ymm12[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm7 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm9[2],ymm13[3,4,5],ymm9[6],ymm13[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2],ymm9[3],ymm13[4,5],ymm9[6],ymm13[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7,8,9,10],ymm3[11],ymm0[12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm8[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm11[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm14 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm11[2,3,0,1] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7,8,9,10,11],ymm4[12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1] +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm13[1],ymm5[2,3,4],ymm13[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm7[3],ymm2[4,5],ymm7[6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm11 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,4,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm15 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2],ymm6[3,4,5],ymm4[6],ymm6[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5],xmm14[6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] @@ -4809,10 +4799,10 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4,5],ymm14[6],ymm13[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm11[2],ymm7[3,4,5],ymm11[6],ymm7[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] @@ -4821,21 +4811,20 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,1,1,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,2] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 @@ -4844,105 +4833,107 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm9[2],mem[3,4,5],ymm9[6],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6],ymm8[7,8,9,10,11,12,13],ymm4[14],ymm8[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm10[2],ymm8[3,4,5],ymm10[6],ymm8[7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6],ymm4[7,8,9,10,11,12,13],ymm1[14],ymm4[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2],xmm1[3],xmm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm7[1],ymm11[2,3,4],ymm7[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7],ymm4[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm15[2],mem[3,4],ymm15[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm14[2],mem[3,4,5],ymm14[6],mem[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1,2,3,4,5,6,7],ymm8[8],ymm2[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6],ymm8[7,8,9,10,11,12,13],ymm5[14],ymm8[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] @@ -4950,92 +4941,94 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4],xmm4[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,5],xmm12[6],xmm2[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm15 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3,4,5,6],ymm9[7,8],ymm7[9,10,11,12,13,14],ymm9[15] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm4[2,3],ymm11[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4,5,6,7],ymm7[8],ymm3[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6,7,8],ymm7[9],ymm3[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4],xmm5[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm15 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3],xmm1[4],xmm7[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3,4,5],xmm11[6],xmm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,7,6] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm12[5,6,7],ymm5[8,9,10,11,12],ymm12[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm13 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7,8],ymm12[9,10,11,12,13,14],ymm14[15] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm12, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1,2,3,4,5,6,7],ymm8[8],ymm5[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4,5,6,7,8],ymm10[9],ymm8[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3],xmm5[4],xmm3[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm2[1],xmm12[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4],xmm4[5],xmm8[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm1[1],xmm11[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6,7,8],ymm4[9],ymm3[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -5059,17 +5052,17 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) -; AVX2-SLOW-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX2-SLOW-NEXT: addq $520, %rsp # imm = 0x208 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride7_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $584, %rsp # imm = 0x248 +; AVX2-FAST-NEXT: subq $648, %rsp # imm = 0x288 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 @@ -5099,11 +5092,12 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,1,u,4,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 @@ -5116,21 +5110,22 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,6,1,u,5,u,u,u> @@ -5139,19 +5134,20 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] @@ -5159,669 +5155,688 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,6,2,5,3,6,2,5] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,1,0,2] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm7 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [3,6,2,5,3,6,2,5] +; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm13, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,2] ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,0,2] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm1[0,1,0,2] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vpblendd $31, (%rsp), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm12[2],ymm10[3,4,5],ymm12[6],ymm10[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm15[4],xmm5[5],xmm15[6],xmm5[7] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4,5],ymm14[6],ymm13[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4,5],ymm12[6],ymm11[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm7 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,5,2,5,2,5,2,5] -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm5, %ymm11 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5] +; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm14 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5],xmm0[6],xmm9[7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm8[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3,4,5],xmm8[6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm12 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,1,1,3] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm11[2],ymm6[3,4],ymm11[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm2 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm10[1],ymm12[2,3,4],ymm10[5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1],xmm7[2],xmm2[3],xmm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm1[2],mem[3,4,5],ymm1[6],mem[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm3[2],ymm0[3,4,5],ymm3[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,2,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm15 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0],ymm5[1,2,3,4,5,6,7],ymm15[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1,2,3,4,5,6,7],ymm6[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1],xmm9[2],xmm6[3],xmm9[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7] +; AVX2-FAST-NEXT: vpermd (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,5,1,4,2,5,1,4] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5,6,7],ymm6[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,7,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6,7],ymm15[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm15[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm4[2],mem[3,4],ymm4[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm5 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm9, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0],xmm9[1],xmm15[2],xmm9[3],xmm15[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1,2,3,4,5,6,7],ymm5[8],ymm7[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,5,1,4,2,5,1,4] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7],ymm8[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1],ymm11[2,3],ymm6[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm1[3],ymm11[4,5],ymm1[6],ymm11[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,4,7,3,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2,3,4,5,6,7],ymm9[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,3,3,3,0,3,7,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7],ymm7[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7],ymm8[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm5[1,2,3,4,5,6,7],ymm0[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,6,1,5,2,6,1,5] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5],mem[6],ymm8[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,7,0,0,4,7,0] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd (%rsp), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,1,5,2,6,1,5] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,0,3,7,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5],mem[6],ymm3[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9) +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,4,0,3,7,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm13[1],mem[2,3],ymm13[4],mem[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-NEXT: addq $584, %rsp # imm = 0x248 +; AVX2-FAST-NEXT: addq $648, %rsp # imm = 0x288 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: subq $552, %rsp # imm = 0x228 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2],ymm12[3,4,5],ymm5[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm4, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7,8,9,10],ymm3[11],ymm0[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm6[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm9[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7,8,9,10,11],ymm4[12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm11[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm6[3],ymm2[4,5],ymm6[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm5[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2],ymm3[3,4,5],ymm1[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm6 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4,5,6,7],ymm15[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4,5],ymm14[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5],xmm15[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm5[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm9[3],ymm4[4,5],ymm9[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm1[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm4[2],ymm9[3,4,5],ymm4[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm13[4],xmm3[5],xmm13[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1,2,3,4,5,6,7],ymm12[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm11[2],ymm6[3,4,5],ymm11[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm14[4],xmm3[5],xmm14[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm12 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm13 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm4[3],ymm9[4,5],ymm4[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3,4,5],xmm8[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm9[1],ymm4[2,3],ymm9[4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5],xmm5[6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm6 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4,5],ymm5[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6],ymm7[7,8,9,10,11,12,13],ymm5[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1],xmm8[2],xmm5[3],xmm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4,5,6,7],ymm7[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm1[2],mem[3,4,5],ymm1[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6],ymm8[7,8,9,10,11,12,13],ymm3[14],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm6[1],ymm11[2,3,4],ymm6[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm3[1],xmm10[2],xmm3[3],xmm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm13 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1,2,3,4,5,6,7],ymm8[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm11[2],ymm7[3,4,5],ymm11[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7,8,9,10,11,12,13],ymm4[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3,4,5,6,7],ymm5[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm1[2],ymm8[3,4,5],ymm1[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm1[3],ymm8[4,5],ymm1[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm0[1,2,3,4,5,6],ymm2[7,8],ymm0[9,10,11,12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6],ymm2[7,8],ymm0[9,10,11,12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm6[1,2,3,4,5,6,7],ymm4[8],ymm6[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3],xmm4[4],xmm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3,4,5,6,7],ymm2[8],ymm8[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm6[2],ymm11[3,4],ymm6[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3],xmm8[4],xmm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3,4,5],xmm15[6],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6],ymm8[7,8],ymm7[9,10,11,12,13,14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5],xmm15[6],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7],ymm10[8,9,10,11,12],ymm5[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $237, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0],ymm6[1],mem[2,3],ymm6[4],mem[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7,8],ymm7[9],ymm6[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4],xmm1[5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1,2,3,4,5,6],ymm13[7,8],ymm10[9,10,11,12,13,14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3,4,5,6,7],ymm10[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4,5,6,7,8],ymm10[9],ymm5[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5],mem[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm8 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3],xmm6[4],xmm1[5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2],ymm11[3],mem[4,5],ymm11[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm8[3],ymm2[4,5],ymm8[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) @@ -5842,11 +5857,11 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $552, %rsp # imm = 0x228 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -5855,670 +5870,661 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: pushq %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm12 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm1[1],ymm4[2,3,4],ymm1[5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm12[1],ymm4[2,3,4],ymm12[5],ymm4[6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm13 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm18[0,1,0,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,1,4,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm5, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm7, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-ONLY-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm3[2,3],ymm6[4,5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm2[3],xmm7[4],xmm2[5],xmm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6,7,8,9,10],ymm2[11],ymm8[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1],xmm11[2,3,4,5],xmm8[6],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm8, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm8[4],xmm2[5],xmm8[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0],xmm14[1],xmm15[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm12[2,3],ymm4[4,5],ymm12[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm4[2],ymm10[3,4],ymm4[5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7,8,9,10],ymm1[11],ymm2[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3,4,5],xmm8[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm13[1],xmm5[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm18[0,1,1,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm18[0,1,1,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,2] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm14, %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm12[3],ymm4[4,5],ymm12[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm0[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,1,2,1,4,5,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm5, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm5, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm15, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm14[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm13[6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm13, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm13, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,2,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm14, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm15, %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm2[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6],ymm2[7,8,9,10,11,12,13],ymm8[14],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3,4,5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6],ymm4[7,8],ymm3[9,10,11,12,13,14],ymm4[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm15[2,3],ymm11[4,5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm0[1],ymm13[2,3,4],ymm0[5],ymm13[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1,2],xmm7[3,4,5,6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm6[2],ymm9[3,4,5],ymm6[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm10[4],xmm3[5],xmm10[6],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm2[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2,3,4,5,6],ymm11[7,8],ymm2[9,10,11,12,13,14],ymm11[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3,4,5],xmm10[6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,1,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm11[4],xmm2[5],xmm11[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,2,0,4,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm29, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm14[4],xmm2[5],xmm14[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,1,2,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm14, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm12[0,1,2,0,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm2, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm2[1],xmm12[2,3,4,5],xmm2[6],xmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1,2,3],xmm9[4],xmm12[5],xmm9[6],xmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm3[2],ymm13[3,4],ymm3[5],ymm13[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,2,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm12, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,0,0,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1,2],ymm2[3,4,5,6,7],ymm14[8,9,10],ymm2[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3],xmm9[4],xmm0[5],xmm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm1[1],xmm13[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm1[5,6,7],ymm7[8,9,10,11,12],ymm1[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm1, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm4[2],ymm12[3,4,5],ymm4[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7,8,9,10,11,12,13],ymm1[14],ymm0[15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6],ymm1[7,8,9,10,11,12,13],ymm7[14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3,4,5],xmm7[6],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1],ymm3[2,3],ymm13[4,5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm13[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3],ymm9[4,5,6,7,8,9,10],ymm12[11],ymm9[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7],ymm9[8,9,10],ymm7[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm9, %ymm12, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3,4,5],xmm12[6],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4],ymm12[5,6,7,8,9,10,11],ymm14[12],ymm12[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm14[1],xmm5[2],xmm14[3],xmm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0,1,2],ymm7[3,4,5,6,7],ymm12[8,9,10],ymm7[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm12, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0],ymm7[1,2,3,4,5,6],ymm12[7,8],ymm7[9,10,11,12,13,14],ymm12[15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm2[2],ymm11[3,4],ymm2[5],ymm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3,4,5],xmm0[6],xmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm15[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm13[3],ymm10[4,5,6,7,8,9,10],ymm13[11],ymm10[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm10, %ymm13, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3,4,5],xmm10[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2],ymm3[3],ymm15[4,5],ymm3[6],ymm15[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm10[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4],ymm10[5,6,7,8,9,10,11],ymm14[12],ymm10[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm14[1],xmm9[2],xmm14[3],xmm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm10, %ymm9, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6],ymm10[7,8],ymm9[9,10,11,12,13,14],ymm10[15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm14 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4,5,6,7,8],ymm14[9],ymm12[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm9[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm4[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4,5,6,7,8],ymm9[9],ymm4[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm12[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm15[1],ymm11[2,3,4],ymm15[5],ymm11[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1],xmm11[2],xmm8[3],xmm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4,5],ymm13[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,3,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1,2],ymm0[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm15[2],ymm3[3,4,5],ymm15[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,3,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm6, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm24 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm3, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm24 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm24 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm6, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm26 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm18, %zmm6, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm24 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm3, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm25, %zmm27 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm13, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, (%rdx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, (%r8) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm20, %zmm6, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm25, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm18, %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm26, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-ONLY-SLOW-NEXT: popq %rax ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12] -; AVX512F-ONLY-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] +; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [10,3,6,15,12,13,6,15] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [3,6,10,13,3,6,10,13] -; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = <1,u,u,u,5,8,12,15> +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [3,6,10,13,3,6,10,13] +; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <1,u,u,u,5,8,12,15> ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,6,9,u,13,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm18, %zmm2, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <1,u,u,u,4,8,11,15> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm0, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,5,9,u,12,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm18, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,u,u,4,7,11,14> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [8,1,12,5,12,5,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm22, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm28 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm28[0,1,0,2] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm13[4],xmm10[5],xmm13[6],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm9, %ymm10, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm18, %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,u,u,u,4,7,11,14> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm27, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm25[0,1,0,2] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vporq %ymm7, %ymm11, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] ; AVX512F-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm26 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3],xmm14[4],xmm13[5],xmm14[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2],xmm13[3,4,5,6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm22 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3],xmm15[4],xmm11[5],xmm15[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1,2],xmm11[3,4,5,6],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm10, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4,5],ymm3[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5],xmm14[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5],xmm15[6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm15 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0],xmm10[1],xmm15[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2,3],xmm0[4],xmm11[5],xmm0[6],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm0[4],xmm8[5],xmm0[6],xmm8[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm12[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3,4,5,6],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3,4,5],xmm7[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3,4,5],xmm8[6],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm2, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm28, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm9[2],ymm8[3,4,5],ymm9[6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4],xmm0[5],xmm7[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm24, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm7[2],ymm5[3,4,5],ymm7[6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm8[4],xmm0[5],xmm8[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm26, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm22, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3],xmm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm19, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm18, %zmm27, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm19, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm7, %ymm0, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3,4,5],xmm7[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vporq %ymm6, %ymm0, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm28[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm25[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm13[2],ymm7[3,4,5],ymm13[6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm12[4],xmm2[5],xmm12[6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3,4,5],xmm12[6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,11,2,11,12,5,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm14, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1,2],ymm2[3,4,5,6,7],ymm14[8,9,10],ymm2[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm14, %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm15, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm18, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,3,7,10,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,3,3,u,0,3,7,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm28, %ymm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5,6,7],ymm4[8,9,10,11,12],ymm14[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm10, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm4, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm2[2],ymm8[3,4,5],ymm2[6],ymm8[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2,3,4,5],xmm9[6],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,11,2,11,12,5,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm13, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1,2],ymm6[3,4,5,6,7],ymm13[8,9,10],ymm6[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm13, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm15, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm16, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,7,10,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,3,3,u,0,3,7,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm13, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7],ymm10[8,9,10,11,12],ymm13[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm12, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm10, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm10 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm12, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm12[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm2[1],xmm12[2,3,4,5],xmm2[6],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,u,u,6,9,13,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm12, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7],ymm12[8,9,10],ymm11[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm16, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1],ymm11[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3,4,5],xmm12[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] -; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm28, %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm12[5,6,7],ymm1[8,9,10,11,12],ymm12[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <3,u,u,u,6,10,13,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2],xmm10[3],xmm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm12, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm11, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm30, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1],xmm11[2,3,4,5],xmm6[6],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm12, %ymm10, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,8,11,15,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm7[1],ymm13[2,3,4],ymm7[5],ymm13[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,10,3,14,7,10,3] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm6, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1,2],ymm6[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,u,u,u,6,9,13,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm11, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm11, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = <0,4,7,11,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm23, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm6[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm10[1],xmm6[2,3,4,5],xmm10[6],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,4,7,0,0,4,7,0] +; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm10, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <3,u,u,u,6,10,13,u> +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2],xmm0[3],xmm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm15, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3,4,5,6,7],ymm10[8,9,10],ymm6[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm10, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <1,4,8,11,15,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm8[1],ymm2[2,3,4],ymm8[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,14,7,10,3] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm26, %zmm6, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm27, %zmm6, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm31, %zmm4, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm22, %zmm2, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm24, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm31, %zmm30, %zmm14 ; AVX512F-ONLY-FAST-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm20, %zmm4, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm22 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm25, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm18, %zmm4, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm12, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm27, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm21, %zmm30, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm17, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm26, %zmm30, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r9) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm12, %zmm30, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq @@ -6529,661 +6535,669 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm1[1],ymm9[2,3,4],ymm1[5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm1[1],ymm4[2,3,4],ymm1[5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm12 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm11 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm21 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm17[0,1,0,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm10, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512DQ-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm21 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm10 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm25 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1,2],xmm2[3],xmm3[4],xmm2[5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7,8,9,10],ymm2[11],ymm3[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm9[2],ymm8[3,4,5],ymm9[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0],xmm5[1],xmm15[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm20 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm12[2,3],ymm7[4,5],ymm12[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm5[2],ymm9[3,4],ymm5[5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7,8,9,10],ymm1[11],ymm2[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm8[2],ymm6[3,4,5],ymm8[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm18 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm11[1],xmm13[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm22 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm17[0,1,1,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm5[3],ymm9[4,5],ymm5[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm26 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm27 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm18 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, %xmm13 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm24 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm26 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm19 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm0[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[0,1,2,1,4,5,6,5] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX512DQ-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm13, %xmm2 +; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm15, %xmm2 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm20 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm21 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm3[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm2[6],xmm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm29 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm11, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm11, %xmm31 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm13, %xmm19 +; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm5, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm27 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm15, %xmm28 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6],ymm0[7,8,9,10,11,12,13],ymm3[14],ymm0[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,2,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4,5,6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3,4,5,6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1,2,3,4,5,6],ymm12[7,8],ymm3[9,10,11,12,13,14],ymm12[15] +; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm14[2],ymm5[3,4],ymm14[5],ymm5[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6],ymm3[7,8],ymm2[9,10,11,12,13,14],ymm3[15] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4,5,6],xmm3[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm14[2,3],ymm11[4,5],ymm14[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm12 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm15[1],ymm11[2,3,4],ymm15[5],ymm11[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4,5,6,7,8],ymm12[9],ymm0[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1],ymm1[2],ymm10[3,4,5],ymm1[6],ymm10[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm10 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5],xmm10[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm14[3],ymm11[4,5],ymm14[6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm24 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm25, %zmm24 ; AVX512DQ-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3,4,5],xmm10[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm17[0,1,2,0,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm10[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm17 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3,4,5],xmm0[6],xmm10[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm11[2],ymm14[3,4,5],ymm11[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6],xmm10[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[0,1,2,0,4,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm18 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm31 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm5[2],ymm14[3,4,5],ymm5[6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,2,0] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm12, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,6,5,4] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm25, %zmm17 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm17 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,0] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm25, %zmm18 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm7[2],ymm12[3,4,5],ymm7[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6],ymm0[7,8,9,10,11,12,13],ymm10[14],ymm0[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1],ymm5[2],ymm9[3,4],ymm5[5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm18 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm22 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm5[3],ymm14[4,5],ymm5[6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm15[2,3],ymm11[4,5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm12[3],ymm3[4,5,6,7,8,9,10],ymm12[11],ymm3[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm10 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3,4,5],xmm10[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm15[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3],ymm12[4,5,6,7,8,9,10],ymm9[11],ymm12[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm7[1],ymm3[2,3],ymm7[4],ymm3[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm5 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm5[1],xmm12[2,3,4,5],xmm5[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm20, %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm5, %zmm0, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm9[1],xmm5[2,3,4,5],xmm9[6],xmm5[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7,8,9,10,11],ymm10[12],ymm9[13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm7[1],ymm3[2,3,4],ymm7[5],ymm3[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm26 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm10, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm21, %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm3[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4],ymm3[5,6,7,8,9,10,11],ymm10[12],ymm3[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7],ymm9[8,9,10],ymm5[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm12 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6],ymm10[7,8],ymm9[9,10,11,12,13,14],ymm10[15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4],xmm6[5],xmm8[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm30, %xmm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2,3,4],ymm2[5,6,7],ymm6[8,9,10,11,12],ymm2[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm31, %xmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm8 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm9[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm25, %zmm6 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm5, %zmm0, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4,5,6,7,8],ymm8[9],ymm5[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,6,4,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm14[1],ymm11[2,3,4],ymm14[5],ymm11[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm10, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm2[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm2[1,2,3,4,5,6],ymm10[7,8],ymm2[9,10,11,12,13,14],ymm10[15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm15[2],ymm13[3,4,5],ymm15[6],ymm13[7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0,1,2,3,4],ymm4[5,6,7],ymm10[8,9,10,11,12],ymm4[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm10 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm4, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1],ymm13[2,3],ymm4[4,5],ymm13[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6,7,8],ymm7[9],ymm3[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,4,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0],xmm7[1],xmm10[2],xmm7[3],xmm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,3,1] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm7[1,2],ymm1[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1,2],ymm7[3,4,5,6,7],ymm1[8,9,10],ymm7[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm25, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7],ymm1[8,9,10],ymm5[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm25, %zmm3 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm22 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, (%rdx) +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm8 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm20 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%r8) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-SLOW-NEXT: popq %rax ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i16_stride7_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm31 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FAST-NEXT: # ymm31 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FAST-NEXT: # ymm30 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <1,u,u,u,5,8,12,15> +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FAST-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <1,u,u,u,5,8,12,15> ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm20, %zmm31, %zmm6 +; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm1, %zmm15 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,u,u,u,4,8,11,15> -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpermd %zmm20, %zmm19, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,u,u,u,4,7,11,14> -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm3, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm18, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm27 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm27[0,1,0,2] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm15[4],xmm12[5],xmm15[6],xmm12[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %ymm9, %ymm12, %ymm23 -; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm12 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,5,9,u,12,u,u,u> +; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm30, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,u,u,4,7,11,14> +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,1,12,5,12,5,14,15] +; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm21, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm26 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm26[0,1,0,2] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vporq %ymm7, %ymm10, %ymm22 +; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] ; AVX512DQ-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm15[3],xmm11[4],xmm15[5],xmm11[6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3,4,5,6],xmm14[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm25 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3,4,5],xmm11[6],xmm13[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %ymm2, %ymm11, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm13[2],ymm2[3,4,5],ymm13[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm14[4],xmm11[5],xmm14[6],xmm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm15 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3],xmm11[4],xmm10[5],xmm11[6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3,4,5,6],xmm8[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vporq %ymm3, %ymm9, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1],ymm3[2],ymm9[3,4,5],ymm3[6],ymm9[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm10 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm26 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1,2,3],xmm5[4],xmm11[5],xmm5[6],xmm11[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3,4,5,6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm10, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2],ymm13[3],ymm2[4,5],ymm13[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1],xmm11[2,3,4,5],xmm6[6],xmm11[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [2,5,2,5,2,5,2,5] -; AVX512DQ-FAST-NEXT: vpermd %ymm27, %ymm11, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3,4,5,6],ymm11[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm10[0],xmm0[1],xmm10[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm25 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm11[4],xmm13[5],xmm11[6],xmm13[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4,5,6],xmm2[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm11, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm2[1],xmm12[2,3,4,5],xmm2[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [2,5,2,5,2,5,2,5] +; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1,2,3,4,5,6],ymm12[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm31 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm11, %zmm17 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm9[2],ymm8[3,4,5],ymm9[6],ymm8[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm14[4],xmm11[5],xmm14[6],xmm11[7] -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm21, %zmm14 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm14, %ymm11, %ymm11 -; AVX512DQ-FAST-NEXT: vpermd %zmm20, %zmm18, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm10, %zmm18, %zmm17 -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm14, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm18 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm7[2],ymm5[3,4,5],ymm7[6],ymm5[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm14 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1,2,3],xmm8[4],xmm12[5],xmm8[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm19, %zmm12 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm12, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm21, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm11, %zmm17, %zmm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] ; AVX512DQ-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm10, %zmm0, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm8, %zmm0, %zmm15 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3],xmm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm1[1],ymm4[2,3,4],ymm1[5],ymm4[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %ymm10, %ymm11, %ymm21 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm27[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm10, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm6[2],ymm10[3,4,5],ymm6[6],ymm10[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6],xmm7[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3,4,5],xmm7[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,3,7,10,14,u,u,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [2,11,2,11,12,5,8,9] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm29, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm21, %zmm18, %zmm11 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm11, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3,4,5],xmm11[6],xmm8[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm26[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6],xmm13[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2],ymm7[3],ymm14[4,5],ymm7[6],ymm14[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,3,7,10,14,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [2,11,2,11,12,5,8,9] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm28, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm12, %zmm17, %zmm19 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7],ymm1[8,9,10],ymm5[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm12, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm1 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm21 +; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm28 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm2[1],ymm13[2,3,4],ymm2[5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm9[1],ymm3[2,3,4],ymm9[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm15, %xmm7 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,u,u,u,6,9,13,u> -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm6[3],ymm10[4,5],ymm6[6],ymm10[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm10, %xmm9 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <2,u,u,u,6,9,13,u> +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3,4,5],xmm13[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm7, %zmm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3,4,5],xmm11[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm9, %zmm9 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7],ymm7[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm31, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,4,7,11,14,u,u,u> +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm9, %ymm1 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm2, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,4,7,11,14,u,u,u> ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm19, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,3,3,u,0,3,7,u> -; AVX512DQ-FAST-NEXT: vpermd %ymm27, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm12, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 +; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm31, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm13, %xmm5 -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm7, %zmm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm13 +; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm9, %zmm9 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm5[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm27, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm14[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7],ymm7[8,9,10,11,12],ymm5[13,14,15] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm12 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <3,u,u,u,6,10,13,u> -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0],xmm7[1],xmm14[2],xmm7[3],xmm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm12, %zmm12 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm9[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3,4,5],xmm13[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,4,7,0,0,4,7,0] +; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm13, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5,6,7],ymm11[8,9,10,11,12],ymm13[13,14,15] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <3,u,u,u,6,10,13,u> +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3],xmm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm13, %zmm13 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7],ymm12[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm12, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,4,8,11,15,u,u,u> -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm18, %zmm13 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2],ymm1[3,4,5,6,7],ymm13[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm13, %ymm11, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <1,4,8,11,15,u,u,u> +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm17, %zmm9 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm1 ; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,10,3,14,7,10,3] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,14,7,10,3] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm10, %zmm17, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm23, %zmm1, %zmm25 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm26 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm26, %zmm1, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm22, %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm25 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm25, %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -7330,169 +7344,169 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i16_stride7_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $1352, %rsp # imm = 0x548 -; SSE-NEXT: movdqa 640(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 624(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm12 -; SSE-NEXT: movdqa 128(%rdi), %xmm14 -; SSE-NEXT: movaps 160(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm6 +; SSE-NEXT: movdqa 640(%rdi), %xmm9 +; SSE-NEXT: movdqa 624(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm8 +; SSE-NEXT: movdqa 128(%rdi), %xmm10 +; SSE-NEXT: movaps 160(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm9 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movaps 144(%rdi), %xmm13 +; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm7[2,2] +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm6[2,2] ; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,0,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,0,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: movdqa 656(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 608(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 608(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 592(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa 560(%rdi), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: movdqa 560(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: movdqa 576(%rdi), %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 +; SSE-NEXT: movaps 48(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,2] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 544(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 528(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 528(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 512(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 496(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 496(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 480(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 ; SSE-NEXT: movdqa 448(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 464(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 464(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 416(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 416(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 400(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 384(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 384(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 ; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] @@ -7502,31 +7516,31 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 880(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 864(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 864(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 848(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm13 -; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: movaps 832(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movaps 832(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 816(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 ; SSE-NEXT: movdqa 784(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] @@ -7536,31 +7550,31 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm13, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 304(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 288(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 272(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 272(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 ; SSE-NEXT: movdqa 224(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] @@ -7570,31 +7584,31 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 768(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 752(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 752(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 736(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 720(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 720(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 704(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 ; SSE-NEXT: movdqa 672(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] @@ -7604,1079 +7618,1066 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm12, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm13, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm13, %xmm11 -; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm15, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: orps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm6 -; SSE-NEXT: orps %xmm1, %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: orps %xmm3, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd $196, (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm7[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: orps %xmm3, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: orps %xmm8, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm8, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm9[0],xmm1[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: orps %xmm8, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm11[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: orps %xmm9, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[0,1,0,1] -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: por %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm2[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm11[0],xmm4[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm4 -; SSE-NEXT: orps %xmm9, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm9 = xmm2[0],xmm9[1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm9 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm7, %xmm12 +; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm2 ; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pand %xmm7, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,1,1,0,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm2 ; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,1,1,0,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: andps %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pandn %xmm12, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: pand %xmm14, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm1 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm4 +; SSE-NEXT: orps %xmm2, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm1 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm6 +; SSE-NEXT: orps %xmm2, %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: orps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: orps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: orps %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm10[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: orps %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: orps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: psrld $16, %xmm15 +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,1,0,1] +; SSE-NEXT: pandn %xmm11, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm11[0],xmm5[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm5 +; SSE-NEXT: orps %xmm6, %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm13, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: por %xmm6, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: andps %xmm14, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: andps %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: andps %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: andps %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: pandn %xmm3, %xmm13 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm7, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: psrld $16, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd $196, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: psrld $16, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: psrld $16, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm3[4],xmm15[5],xmm3[5],xmm15[6],xmm3[6],xmm15[7],xmm3[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: psrld $16, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm3[4],xmm14[5],xmm3[5],xmm14[6],xmm3[6],xmm14[7],xmm3[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm1 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: psrld $16, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: psrld $16, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: psrlq $16, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,2] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0,2] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[0,2] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -8684,10 +8685,30 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] @@ -8763,7 +8784,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,3] @@ -8773,164 +8794,145 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 112(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 96(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 80(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 64(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 112(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 96(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 80(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 64(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movdqa %xmm14, 112(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm10, 112(%rax) -; SSE-NEXT: movaps %xmm11, 96(%rax) -; SSE-NEXT: movaps %xmm14, 80(%rax) -; SSE-NEXT: movaps %xmm15, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rax) -; SSE-NEXT: movaps %xmm13, (%rax) +; SSE-NEXT: movaps %xmm11, 112(%rax) +; SSE-NEXT: movaps %xmm12, 96(%rax) +; SSE-NEXT: movaps %xmm13, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, 112(%rax) -; SSE-NEXT: movapd %xmm1, 96(%rax) -; SSE-NEXT: movapd %xmm2, 80(%rax) -; SSE-NEXT: movapd %xmm5, 64(%rax) -; SSE-NEXT: movapd %xmm6, 48(%rax) -; SSE-NEXT: movapd %xmm7, 32(%rax) -; SSE-NEXT: movapd %xmm8, 16(%rax) -; SSE-NEXT: movapd %xmm9, (%rax) +; SSE-NEXT: movapd %xmm4, 96(%rax) +; SSE-NEXT: movapd %xmm5, 80(%rax) +; SSE-NEXT: movapd %xmm6, 64(%rax) +; SSE-NEXT: movapd %xmm7, 48(%rax) +; SSE-NEXT: movapd %xmm8, 32(%rax) +; SSE-NEXT: movapd %xmm9, 16(%rax) +; SSE-NEXT: movapd %xmm10, (%rax) ; SSE-NEXT: addq $1352, %rsp # imm = 0x548 ; SSE-NEXT: retq ; @@ -8959,18 +8961,18 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 @@ -8981,10 +8983,10 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm6[2],xmm7[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[2],xmm6[2],zero ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] @@ -8998,7 +9000,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm2 @@ -9020,15 +9022,15 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm4[2],xmm3[2],zero +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm5[2],xmm15[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9060,9 +9062,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9072,25 +9074,25 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,2,3,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[2],xmm4[2],zero +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm9[2],xmm3[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] @@ -9120,11 +9122,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 880(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 880(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] @@ -9136,12 +9138,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm4[2],xmm3[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm10 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm14[2],xmm3[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 752(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9179,19 +9180,21 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm6[1],mem[2,3,4,5,6,7] @@ -9201,6 +9204,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] @@ -9208,14 +9212,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1,2,3,4,5],mem[6],xmm0[7] +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm0[6],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] @@ -9224,12 +9228,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm5[0],mem[1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] @@ -9244,8 +9248,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 @@ -9257,8 +9261,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm14[6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1,2,3,4,5],mem[6],xmm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] @@ -9267,25 +9272,26 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm12[0],mem[1],xmm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0],xmm10[1],xmm12[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] ; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 @@ -9293,44 +9299,43 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5],xmm8[6],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5],xmm0[6],xmm10[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslld $16, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm11[1],xmm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm15[1],xmm11[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 @@ -9351,63 +9356,64 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm4[7] ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm3[1],xmm5[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,1,2,3] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $16, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $16, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload @@ -9418,146 +9424,230 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsllq $16, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1],xmm4[2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsllq $16, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],xmm15[7] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm9, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $16, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufd $196, (%rsp), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm2[1],xmm6[1] +; AVX1-ONLY-NEXT: vpsllq $16, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm3[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,5,6],xmm14[7] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm11, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm14, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm9, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2,3,4,5],mem[6],xmm7[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm7[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm5[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,4,5,6,7,8,9,4,5,8,9,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm11, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm9, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],xmm15[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm11[0,1,2,3,4,5],mem[6],xmm11[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm2[1,2],xmm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm11[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm15, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm14, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm13[0,1,2,3,4,5],mem[6],xmm13[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm4[1,2],xmm12[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,4,5],xmm0[6],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,0,0,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,4,5,6,7,8,9,4,5,8,9,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm0[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm10, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm1, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload @@ -9565,265 +9655,176 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm0[0,1,2,3,4,5],mem[6],xmm0[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1,2],xmm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2,3,4,5],xmm0[6],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm3[1,2],xmm12[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm10[0],mem[1],xmm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,7,7,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm15, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3,4,5],xmm12[6],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm4[1,2],xmm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0],xmm1[1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,7,7,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm1, %ymm13 -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3,4,5],xmm1[6],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm1[0],mem[1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,7,7,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm9, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2],xmm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[0,1,0,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -9854,12 +9855,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload @@ -9867,268 +9868,229 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm11[1],xmm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm0[6],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm0[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm0[6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3,4,5],mem[6],xmm2[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,5],xmm7[6],xmm9[7] +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm1[0,1,2,3,4,5],mem[6],xmm1[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm13[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm1[1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1],xmm6[1],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm2[0],mem[1],xmm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,5],xmm2[6],xmm8[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm13[1],xmm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,7,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = zero,xmm14[1],mem[0],zero -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = zero,xmm4[1],mem[0],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm9[1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm3[1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[3,3,3,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = zero,xmm0[1],mem[0],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = zero,xmm15[1],mem[0],zero +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm11[2],xmm14[2],xmm11[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm0[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm8[1],xmm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload @@ -10138,14 +10100,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm7 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0],xmm0[1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -10167,25 +10129,65 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm7, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm3, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = zero,xmm0[1],mem[0],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0],xmm0[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm3, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -10221,24 +10223,24 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) ; AVX1-ONLY-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1480, %rsp # imm = 0x5C8 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm11 +; AVX2-SLOW-NEXT: subq $1448, %rsp # imm = 0x5A8 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm12 ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 @@ -10247,397 +10249,394 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm9[2],ymm14[3,4,5],ymm9[6],ymm14[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4,5],ymm9[6],ymm10[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm7[1],ymm11[2,3,4],ymm7[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm15[1],ymm12[2,3,4],ymm15[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm12[2],ymm8[3,4,5],ymm12[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm0[2],ymm8[3,4,5],ymm0[6],ymm8[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm11[1],ymm5[2,3,4],ymm11[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4,5],ymm3[6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm15 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7,8,9,10],ymm3[11],ymm2[12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm9[3],ymm14[4,5],ymm9[6],ymm14[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7,8,9,10],ymm4[11],ymm3[12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7,8,9,10],ymm4[11],ymm3[12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm11 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm10 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7,8,9,10],ymm4[11],ymm3[12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2],ymm7[3],ymm15[4,5],ymm7[6],ymm15[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm6[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm15[1],ymm7[2,3],ymm15[4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm9 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm15[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm15[2,3],ymm6[4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm0[0,1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm10[2,3],ymm0[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm11[2,3],ymm0[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm11[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7,8,9,10,11],ymm4[12],ymm3[13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2],ymm1[3],mem[4,5],ymm1[6],mem[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm12[1],ymm8[2,3,4],ymm12[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm0[0,1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm14[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm12 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,4,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd $31, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,4,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,4,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpblendd $31, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,4,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm0[3],ymm5[4,5],ymm0[6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,4,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm8[2],ymm12[3,4,5],ymm8[6],ymm12[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm13[4],xmm1[5],xmm13[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm11[2],ymm13[3,4,5],ymm11[6],ymm13[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm15 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5],xmm15[6],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm9[2],ymm7[3,4,5],ymm9[6],ymm7[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,1,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm6[0,1,1,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,1,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm9[3],ymm7[4,5],ymm9[6],ymm7[7] ; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm15[0,1,1,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,1,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,1,2] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] @@ -10645,2153 +10644,2157 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[0,1,2,0,4,5,6,4] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,4] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm13[2],mem[3,4,5],ymm13[6],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7,8,9,10,11,12,13],ymm5[14],ymm6[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1],xmm9[2],xmm5[3],xmm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1,2,3,4,5,6,7],ymm6[8],ymm3[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm10[1],mem[2,3,4],ymm10[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7,8,9,10,11,12,13],ymm4[14],ymm5[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm3[1],xmm6[2],xmm3[3],xmm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3,4,5,6,7],ymm5[8],ymm6[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm6[2],mem[3,4,5],ymm6[6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4,5,6,7],ymm7[8],ymm4[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm5[2],ymm2[3,4,5],ymm5[6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1,2,3,4,5,6,7],ymm6[8],ymm3[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm6 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7,8,9,10,11,12,13],ymm5[14],ymm6[15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm9[2],ymm14[3,4],ymm9[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm15[2],ymm7[3,4,5],ymm15[6],ymm7[7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7,8,9,10,11,12,13],ymm4[14],ymm5[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3,4,5,6,7],ymm5[8],ymm2[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm6[2],ymm8[3,4,5],ymm6[6],ymm8[7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm3[6],ymm1[7,8,9,10,11,12,13],ymm3[14],ymm1[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $221, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm11[2],mem[3,4,5],ymm11[6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7,8,9,10,11,12,13],ymm4[14],ymm2[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $221, (%rsp), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 656(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5],xmm2[6],xmm4[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 656(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,5],xmm13[6],xmm14[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm12[2,3],mem[4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm15[3],ymm7[4,5],ymm15[6],ymm7[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6],ymm2[7,8],ymm1[9,10,11,12,13,14],ymm2[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4],xmm4[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3],xmm1[4],xmm4[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6],xmm5[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7],ymm2[8,9,10,11,12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm9[2,3],ymm14[4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm1[1,2,3,4,5,6],ymm2[7,8],ymm1[9,10,11,12,13,14],ymm2[15] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 880(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5],xmm2[6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm15[2,3],ymm9[4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3],xmm4[4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2,3,4,5],xmm10[6],xmm14[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm4[6],xmm5[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,6] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2],ymm5[3],mem[4,5],ymm5[6],mem[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7,8],ymm5[9,10,11,12,13,14],ymm6[15] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm13[2,3],mem[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6,7,8],ymm4[9],ymm0[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm0[3],xmm5[4],xmm0[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[0],xmm0[1],mem[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7],ymm5[8,9,10,11,12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 880(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0],ymm6[1,2,3,4,5,6,7],ymm5[8],ymm6[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4,5,6,7,8],ymm6[9],ymm3[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3],xmm5[4],xmm6[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5],xmm15[6],xmm3[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7,8],ymm6[9,10,11,12,13,14],ymm7[15] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4],xmm6[5],xmm7[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm5[1,2,3,4,5,6,7],ymm2[8],ymm5[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm5[1],ymm2[2,3,4,5,6,7,8],ymm5[9],ymm2[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4],xmm2[5],xmm5[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm15[3],ymm9[4,5],ymm15[6],ymm9[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3],xmm6[4],xmm3[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = mem[0],xmm6[1],mem[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2,3,4,5,6,7],ymm6[8],ymm7[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4,5,6,7,8],ymm10[9],ymm7[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm8[0,1],mem[2,3],ymm8[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4],xmm9[5],xmm10[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4,5,6,7,8],ymm7[9],ymm1[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4],xmm7[5],xmm9[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7],ymm7[8,9,10,11,12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4],xmm4[5],xmm7[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2],ymm6[3],mem[4,5],ymm6[6],mem[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7],ymm4[8,9,10,11,12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3],xmm6[4],xmm2[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0],xmm14[1],xmm10[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6,7,8],ymm6[9],ymm2[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 64(%rax) -; AVX2-SLOW-NEXT: addq $1480, %rsp # imm = 0x5C8 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-SLOW-NEXT: addq $1448, %rsp # imm = 0x5A8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride7_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1544, %rsp # imm = 0x608 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm12 ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm10[2],ymm11[3,4,5],ymm10[6],ymm11[7] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm15 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm13 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm1[1],ymm12[2,3,4],ymm1[5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3,4,5],ymm0[6],ymm3[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,5,1,u,4,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm10[0,1],mem[2],ymm10[3,4],mem[5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm6[3],ymm12[4,5],ymm6[6],ymm12[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendd $219, (%rsp), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm9[2,3],ymm0[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,6,1,u,5,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,5,1,u,4,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6,7] +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm6, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,6,1,u,5,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4],ymm1[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2],ymm7[3],ymm13[4,5],ymm7[6],ymm13[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm11 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm4[1,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm14[3],ymm9[4,5],ymm14[6],ymm9[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,6,2,5,3,6,2,5] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm10[3],ymm1[4,5],ymm10[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,6,2,5,3,6,2,5] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 640(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 640(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm4[3],ymm13[4,5],ymm4[6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 864(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5],ymm15[6],ymm14[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm9[2],ymm8[3,4,5],ymm9[6],ymm8[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vpblendd $31, (%rsp), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm13[2],ymm7[3,4,5],ymm13[6],ymm7[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa 864(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm14[4],xmm5[5],xmm14[6],xmm5[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm7[2],ymm12[3,4,5],ymm7[6],ymm12[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7],ymm4[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm6[2],ymm15[3,4,5],ymm6[6],ymm15[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,1,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3,4,5,6,7],ymm5[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3,4,5],xmm5[6],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,1,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3,4,5,6,7],ymm5[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3,4,5],xmm5[6],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm13[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,1,3] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm12[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm9[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm12 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm10 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm1[2],mem[3,4,5],ymm1[6],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,3,7,2,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4,5,6,7],ymm6[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm0[0,1],mem[2],ymm0[3,4,5],mem[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,3,7,2,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm5 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm7[2],ymm13[3,4],ymm7[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm4 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm14[1],ymm11[2,3,4],ymm14[5],ymm11[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm10 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm6[2],ymm3[3,4,5],ymm6[6],ymm3[7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2],ymm12[3,4],ymm0[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm8[2],mem[3,4,5],ymm8[6],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm9[0,1],mem[2],ymm9[3,4,5],mem[6],ymm9[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4,5],mem[6],ymm4[7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4,5],ymm3[6],mem[7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $221, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,5,1,4,2,5,1,4] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7],ymm4[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm15[2,3],ymm7[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm7 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm0 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm11 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,4,7,3,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3,4,5,6,7],ymm5[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,3,3,3,0,3,7,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3,4],ymm4[5],mem[6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7],ymm4[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm15[2,3],mem[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm11, %xmm7 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,3,3,0,3,7,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,5,1,4,2,5,1,4] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,7,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm12[1,2,3,4,5,6,7],ymm0[8],ymm12[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm14[3],ymm11[4,5],ymm14[6],ymm11[7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3,4,5,6,7],ymm5[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm10[0,1],mem[2],ymm10[3,4],mem[5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7],ymm4[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2],ymm8[3],mem[4,5],ymm8[6],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3,4,5,6,7],ymm5[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm6[3],ymm12[4,5],ymm6[6],ymm12[7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm13 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm6 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm5, %ymm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm13 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,6,1,5,2,6,1,5] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2],ymm15[3],mem[4,5],ymm15[6],mem[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <1,4,0,3,7,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2],ymm10[3],ymm13[4,5],ymm10[6],ymm13[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7],ymm3[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1,2,3,4,5,6,7],ymm5[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,0,3,7,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,2],ymm9[3],mem[4,5],ymm9[6],mem[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0,1],mem[2,3],ymm5[4,5],mem[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm9, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7],ymm5[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $237, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm4 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5],mem[6],ymm3[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm15, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX2-FAST-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1512, %rsp # imm = 0x5E8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: subq $1448, %rsp # imm = 0x5A8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm5[2],ymm12[3,4,5],ymm5[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2],ymm12[3,4,5],ymm9[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm14[2],ymm15[3,4,5],ymm14[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm13[2],ymm15[3,4,5],ymm13[6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4,5],ymm3[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm13[1],ymm9[2,3,4],ymm13[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2],ymm10[3,4,5],ymm0[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7,8,9,10],ymm4[11],ymm3[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm13[2],ymm9[3,4],ymm13[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7,8,9,10],ymm4[11],ymm3[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7,8,9,10],ymm4[11],ymm3[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm15[2,3],ymm1[4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm8[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm14[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7,8,9,10,11],ymm4[12],ymm3[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, (%rsp), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm15[1],ymm6[2,3,4],ymm15[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm10[1],mem[2,3,4],ymm10[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[1,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm15[3],ymm1[4,5],ymm15[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm13[0,1,2],mem[3],ymm13[4,5],mem[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm12, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm4 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm14[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm7[2],ymm11[3,4,5],ymm7[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm13[4],xmm4[5],xmm13[6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm1 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm14[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm10, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm9[2],ymm3[3,4,5],ymm9[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5],xmm11[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm6[2],ymm8[3,4,5],ymm6[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm12, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm6[2],ymm3[3,4,5],ymm6[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm12[2],ymm4[3,4,5],ymm12[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm10[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm4 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm5 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm15[3],ymm9[4,5],ymm15[6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm7[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm10[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm11[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm9[1],ymm15[2,3],ymm9[4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3,4,5],xmm6[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm6 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm13[2],mem[3,4,5],ymm13[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm8[1],ymm11[2,3,4],ymm8[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2],xmm4[3],xmm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6,7],ymm7[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm9[1],ymm13[2,3],ymm9[4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm14[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4,5],mem[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4,5,6,7],ymm7[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7,8,9,10,11,12,13],ymm5[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm1, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm15[2],ymm12[3,4,5],ymm15[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7,8,9,10,11,12,13],ymm4[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm10[1],ymm8[2,3,4],ymm10[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm3[1],xmm13[2],xmm3[3],xmm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm6 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6,7],ymm7[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $68, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm5, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6],ymm7[7,8,9,10,11,12,13],ymm5[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3,4],ymm5[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0],xmm5[1],xmm13[2],xmm5[3],xmm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm8, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6,7],ymm7[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2],ymm4[3,4],ymm14[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm14[0,1],mem[2],ymm14[3,4,5],mem[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm6[2],ymm10[3,4,5],ymm6[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm5, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6],ymm7[7,8,9,10,11,12,13],ymm5[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0],xmm5[1],xmm13[2],xmm5[3],xmm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm2[1,2,3,4,5,6,7],ymm7[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm13[2],ymm8[3,4,5],ymm13[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7,8,9,10,11,12,13],ymm4[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 656(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 656(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm0[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm6[3],ymm10[4,5],ymm6[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6],ymm4[7,8],ymm1[9,10,11,12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3],xmm4[4],xmm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5],xmm5[6],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm13[3],ymm8[4,5],ymm13[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0],ymm1[1,2,3,4,5,6],ymm4[7,8],ymm1[9,10,11,12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3],xmm1[4],xmm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 880(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0,1,2,3,4,5],xmm4[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5,6,7],ymm12[8,9,10,11,12],ymm13[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0],ymm12[1,2,3,4,5,6,7],ymm7[8],ymm12[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3],xmm5[4],xmm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1,2,3,4,5],xmm7[6],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2],ymm5[3],mem[4,5],ymm5[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6],ymm7[7,8],ymm5[9,10,11,12,13,14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3],xmm3[4],xmm0[5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm3[0],mem[1],xmm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm5 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4,5,6,7,8],ymm7[9],ymm2[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2],xmm8[3],xmm1[4],xmm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7],ymm4[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 880(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5,6,7],ymm11[8,9,10,11,12],ymm15[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0],ymm11[1,2,3,4,5,6,7],ymm8[8],ymm11[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm6[2],mem[3,4],ymm6[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1],xmm8[2],xmm11[3],xmm8[4],xmm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,5],xmm14[6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7],ymm8[8,9,10,11,12],ymm3[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4,5,6],ymm13[7,8],ymm8[9,10,11,12,13,14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4],xmm3[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,2],ymm3[3],mem[4,5],ymm3[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3,4,5,6,7],ymm2[8],ymm8[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm12[0],mem[1],ymm12[2,3],mem[4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6,7,8],ymm9[9],ymm8[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3],xmm12[4],xmm9[5],xmm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2],ymm1[3],mem[4,5],ymm1[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4,5,6,7,8],ymm4[9],ymm1[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4],xmm4[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[0],xmm7[1],mem[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4],xmm4[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6,7,8],ymm8[9],ymm1[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,2],ymm9[3],mem[4,5],ymm9[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm8[1,2,3,4,5,6,7],ymm1[8],ymm8[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0],xmm5[1],xmm14[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4,5,6,7,8],ymm7[9],ymm4[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm15[0,1,2],mem[3],ymm15[4,5],mem[6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7,8],ymm6[9],ymm5[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $1512, %rsp # imm = 0x5E8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: addq $1448, %rsp # imm = 0x5A8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride7_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $1800, %rsp # imm = 0x708 +; AVX512F-ONLY-SLOW-NEXT: subq $1864, %rsp # imm = 0x748 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm18 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> @@ -12799,8 +12802,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 544(%rdi), %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm10 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 @@ -12809,114 +12812,111 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 672(%rdi), %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm7 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm2[2],ymm4[3,4,5],ymm2[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm22[0,1,0,2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm19[0,1,0,2] ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6,7,8,9,10],ymm0[11],ymm5[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7,8,9,10],ymm0[11],ymm3[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm7, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm5, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm2[2],ymm13[3,4,5],ymm2[6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4],xmm0[5],xmm7[6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,0,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm25[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm15[1],xmm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,0,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 528(%rdi), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6,7,8,9,10],ymm8[11],ymm9[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1],ymm2[2],ymm11[3,4,5],ymm2[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm20[0,1,0,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm27[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm17, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 528(%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6,7,8,9,10],ymm7[11],ymm8[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 688(%rdi), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm1[1],xmm3[2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm15[2,3],ymm1[4,5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm15, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1],ymm14[2,3],ymm4[4,5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm14, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm14[1],ymm2[2,3],ymm14[4],ymm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128] @@ -12925,1808 +12925,1813 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm17 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm22[0,1,1,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm19[0,1,1,2] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm15, %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm14, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm5, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm20[0,1,1,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm16[0,1,1,2] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm17, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm22[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,1,1,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,1,4,5,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm16, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm23, %xmm7 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm12[1],ymm9[2,3,4],ymm12[5],ymm9[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm20[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[0,1,2,1,4,5,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm3, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm3, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4],xmm5[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5],xmm1[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4],xmm5[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5],xmm6[6],xmm7[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm18, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3],xmm7[4],xmm8[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 656(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0,1,2,3,4,5],xmm5[6],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm28, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3],xmm5[4],xmm8[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 656(%rdi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,5],xmm1[6],xmm5[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,7,6] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm11 ; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm21, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm20 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm14 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7],ymm8[8,9,10,11,12],ymm6[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm12[3],ymm2[4,5],ymm12[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm27[0,1,2,1,4,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] ; AVX512F-ONLY-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 {%k1} # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm12[2,3],ymm2[4,5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm20 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm7[1],xmm5[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 736(%rdi), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 800(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 768(%rdi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm3[3],ymm10[4,5],ymm3[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 768(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 832(%rdi), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 864(%rdi), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm13[2],ymm5[3,4],ymm13[5],ymm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 832(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 864(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm4[3],ymm14[4,5],ymm4[6],ymm14[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,2,1,4,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 16-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm7[3],ymm2[4,5],ymm7[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7,8,9,10,11,12,13],ymm4[14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm9[3],ymm14[4,5],ymm9[6],ymm14[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm14, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6],ymm8[7,8],ymm1[9,10,11,12,13,14],ymm8[15] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm1, %xmm27 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm27[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6],ymm0[7,8,9,10,11,12,13],ymm13[14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3,4,5,6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm12[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6],ymm8[7,8],ymm1[9,10,11,12,13,14],ymm8[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm13[2,3],ymm2[4,5],ymm13[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6],xmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3,4,5,6],xmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm12, %xmm25 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm25[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1,2,3],xmm6[4],xmm11[5],xmm6[6],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6],ymm12[7,8],ymm11[9,10,11,12,13,14],ymm12[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3,4,5,6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm14[1],ymm7[2,3,4],ymm14[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm11 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3,4,5,6,7,8],ymm0[9],ymm8[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm22, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm27 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7,8],ymm0[9],ymm11[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm10[2],ymm4[3,4,5],ymm10[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm24, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm25 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm27[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm25[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2],ymm11[3,4,5],ymm12[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm8[4],xmm1[5],xmm8[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm22, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm13[3],ymm2[4,5],ymm13[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2,3,4,5,6,7,8],ymm0[9],ymm6[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm11[4],xmm6[5],xmm11[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm24, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[0,1,2,0,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm8[4],xmm1[5],xmm8[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,0,4,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm6 = mem[0,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm13, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1],ymm4[2],ymm13[3,4,5],ymm4[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm11[4],xmm6[5],xmm11[6],xmm6[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,2,0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm8, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7],ymm8[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,2,0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0,1,2],ymm6[3,4,5,6,7],ymm11[8,9,10],ymm6[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[0,1,2,0,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm14[2],ymm13[3,4,5],ymm14[6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[0,1,2,0,4,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm6 = mem[0,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,2,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,0,0,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3,4,5,6,7],ymm1[8,9,10],ymm8[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3,4,5,6,7],ymm6[8,9,10],ymm11[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm12[2],ymm10[3,4,5],ymm12[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4,5],ymm5[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm11 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6],ymm8[7,8,9,10,11,12,13],ymm0[14],ymm8[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6],ymm11[7,8,9,10,11,12,13],ymm0[14],ymm11[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5],xmm0[6],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm7[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3],ymm9[4,5,6,7,8,9,10],ymm14[11],ymm9[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm4[3],ymm13[4,5],ymm4[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm14[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7,8,9,10],ymm13[11],ymm11[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm9, %ymm14, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm11, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm11, %ymm13, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4,5],ymm0[6],mem[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4,5],ymm0[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6],ymm9[7,8,9,10,11,12,13],ymm0[14],ymm9[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm9 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3,4,5],xmm0[6],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm26[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7,8,9,10],ymm15[11],ymm14[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm14, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7,8,9,10,11,12,13],ymm0[14],ymm6[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1],ymm4[2],ymm13[3,4],ymm4[5],ymm13[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm20, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm22[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3],ymm6[4,5,6,7,8,9,10],ymm11[11],ymm6[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm11[1],xmm6[2,3,4,5],xmm11[6],xmm6[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm14, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm6, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm24 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3,4,5],xmm14[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3],xmm15[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm8, %xmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3],ymm1[4],ymm12[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3,4,5],xmm6[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1],xmm11[2],xmm6[3],xmm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm14, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm6, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm11[2,3],ymm13[4,5],ymm11[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm13, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm2[3],ymm7[4,5],ymm2[6],ymm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4],ymm0[5,6,7,8,9,10,11],ymm13[12],ymm0[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2],xmm15[3],xmm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm12, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm0[1,2,3,4,5,6],ymm12[7,8],ymm0[9,10,11,12,13,14],ymm12[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3],ymm0[4],ymm10[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4,5,6,7,8],ymm12[9],ymm0[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3,4,5],xmm11[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0],xmm11[1],xmm15[2],xmm11[3],xmm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm13, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm20, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4,5],mem[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5],ymm2[6],ymm3[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,3,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7,8],ymm15[9],ymm14[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm14, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm8[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3,4,5,6,7,8],ymm15[9],ymm8[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm12[3],ymm4[4,5],ymm12[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm27, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0],ymm1[1],ymm9[2,3,4],ymm1[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm10[1],xmm13[2],xmm10[3],xmm13[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm1[0,1],mem[2],ymm1[3,4,5],mem[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm9 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm6 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm13, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm9, %zmm13, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,3,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm10[1,2],ymm5[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7],ymm5[8,9,10],ymm4[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm20, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm4[0,1],mem[2],ymm4[3,4,5],mem[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,3,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1,2],ymm5[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm21 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm23 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm12 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm18 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm11, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm16 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm13, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm15, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm21, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r9) -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm26, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm30 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm25 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm11 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm23, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $1800, %rsp # imm = 0x708 +; AVX512F-ONLY-SLOW-NEXT: addq $1864, %rsp # imm = 0x748 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1736, %rsp # imm = 0x6C8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: subq $1768, %rsp # imm = 0x6E8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <2,5,9,u,12,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,5,9,u,12,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm5, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 480(%rdi), %ymm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm6 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm12 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm4, %ymm6, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm4, %ymm6, %ymm22 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 672(%rdi), %xmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm20 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,0,2] +; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,1,0,2] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm12[2],ymm6[3,4,5],ymm12[6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vporq %ymm2, %ymm3, %ymm29 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm13 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm1[1],xmm5[2,3,4,5],xmm1[6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm5, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm10[2],ymm11[3,4,5],ymm10[6],ymm11[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0],xmm13[1],xmm14[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1],xmm9[2,3,4,5],xmm6[6],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm22, %zmm19, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm11[2],ymm3[3,4,5],ymm11[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm13[1],xmm15[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm15, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm26[0,1,0,2] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 688(%rdi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,6,9,u,13,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2,3,4,5],xmm7[6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm8, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0],xmm4[1],xmm8[2,3,4,5],xmm4[6],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm30[0,1,0,2] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 688(%rdi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0],xmm2[1],xmm4[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,6,9,u,13,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0],xmm8[1],xmm5[2,3,4,5],xmm8[6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm9, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2],ymm11[3],ymm3[4,5],ymm11[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [2,5,2,5,2,5,2,5] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm18, %ymm11, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm24, %ymm11, %ymm12 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm14, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm22, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm9, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3,4,5],xmm3[6],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm26, %ymm11, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm17 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3,4,5,6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm13, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2],xmm3[3],xmm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [10,3,6,15,12,13,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm20, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm9, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm7, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm10, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm11, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm31 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm5[1],xmm9[2],xmm5[3],xmm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [10,3,6,15,12,13,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm27, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm10, %ymm9, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm11[1],ymm5[2,3],ymm11[4],ymm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm9[1],xmm3[2,3,4,5],xmm9[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm18[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3,4,5],xmm10[6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm22, %zmm20, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0],xmm8[1],xmm15[2],xmm8[3],xmm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm1, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm8, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3,4,5],xmm6[6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm26[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4],ymm11[5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm19, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [3,6,10,13,3,6,10,13] -; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm25, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3,4,5],xmm8[6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm30[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm20, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [3,6,10,13,3,6,10,13] +; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm23, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX512F-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm16 {%k1} # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 {%k1} # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm28, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,3,3,3,0,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm18, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,5,9,12,2,5,9,12] -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm8, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7],ymm7[8,9,10,11,12],ymm3[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm13, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3],xmm7[4],xmm3[5],xmm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,u,u,4,7,11,14> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4,5,6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm25, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 {%k1} # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm26, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm17, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm18, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,3,3,0,3,7,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm24, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,5,9,12,2,5,9,12] +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm17, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4],xmm1[5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,u,u,u,4,7,11,14> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm23, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 {%k1} # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm31, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm21 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,4,7,0,0,4,7,0] -; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm18, %ymm10, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,6,9,13,2,6,9,13] -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm11, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7],ymm4[8,9,10,11,12],ymm2[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} xmm18 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,7,0,0,4,7,0] +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm24, %ymm9, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} xmm31 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm8, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4],xmm5[5],xmm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm14, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3,4,5,6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm14, %zmm8, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3],xmm12[4],xmm3[5],xmm12[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm16, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm12, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3,4,5,6],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <1,u,u,u,4,8,11,15> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3,4,5,6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm14, %zmm11, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5],xmm3[6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,u,u,u,4,8,11,15> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm16, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3,4,5,6],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm15, %zmm12 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm0[4],xmm7[5],xmm0[6],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm15, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3,4,5,6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm12[2],ymm5[3,4,5],ymm12[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <1,u,u,u,5,8,12,15> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm6, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm14, %zmm25, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm2[2],ymm4[3,4,5],ymm2[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4],xmm0[5],xmm7[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm25, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm12[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm26, %ymm10, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3],xmm3[4],xmm12[5],xmm3[6],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3,4,5,6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm13[2],ymm10[3,4,5],ymm13[6],ymm10[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,u,u,u,5,8,12,15> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm23, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm6[2],ymm0[3,4,5],ymm6[6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5],xmm5[6],xmm1[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm9, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm13[2],ymm2[3,4,5],ymm13[6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm1[2],ymm14[3,4,5],ymm1[6],ymm14[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm12[3],ymm5[4,5],ymm12[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,11,2,11,12,5,8,9] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,11,2,11,12,5,8,9] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm5, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm9, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 864(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 832(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm1[2],ymm8[3,4,5],ymm1[6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 832(%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2],ymm13[3,4,5],ymm1[6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0],xmm2[1],xmm9[2,3,4,5],xmm2[6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3,4,5],xmm2[6],xmm5[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero ; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm9[2],ymm12[3,4],ymm9[5],ymm12[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm20 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,3,7,10,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm17, %zmm19, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,3,7,10,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm15, %zmm16, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm18, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3,4,5],xmm1[6],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm14[1],xmm9[2,3,4,5],xmm14[6],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5],xmm1[6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3,4,5],xmm11[6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,u,u,u,6,9,13,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <2,u,u,u,6,9,13,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm18, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm9, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm5, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm16, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm6[3],ymm13[4,5],ymm6[6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm19 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm18, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3,4,5],xmm9[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3,4,5],xmm5[6],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero ; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm25 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm8, %xmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,4,7,11,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm17, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm15, %zmm16, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm22 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3,4,5],xmm14[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3],xmm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3,4,5],xmm12[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0],ymm2[1],ymm8[2,3,4],ymm2[5],ymm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2],xmm12[3],xmm15[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <3,u,u,u,6,10,13,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm14, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm12, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm17 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm16, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm19 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm10, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2],xmm10[3],xmm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm4[1],xmm11[2],xmm4[3],xmm11[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,8,11,15,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm26, %zmm10, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,4,8,11,15,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm22, %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,10,3,14,7,10,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm7, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1,2],ymm5[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm2[2],ymm8[3,4],ymm2[5],ymm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,14,7,10,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm8[1,2],ymm13[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm10, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm10[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm4[1],ymm8[2,3,4],ymm4[5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm7, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm9[1],ymm5[2,3,4],ymm9[5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1],ymm10[2],ymm5[3,4],ymm10[5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm5 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm13 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm17 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm11 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm10, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm13, %zmm10, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2],ymm5[3,4,5,6,7],ymm4[8,9,10],ymm5[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm16 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm10, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm9[1,2],ymm7[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm15 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm12 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512F-ONLY-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm5, %zmm12 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm7 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm9, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm9 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm10 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm13, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm14, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm14, %zmm13 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm16, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm0 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1736, %rsp # imm = 0x6C8 +; AVX512F-ONLY-FAST-NEXT: addq $1768, %rsp # imm = 0x6E8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: load_i16_stride7_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $1496, %rsp # imm = 0x5D8 +; AVX512DQ-SLOW-NEXT: subq $1560, %rsp # imm = 0x618 ; AVX512DQ-SLOW-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 544(%rdi), %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa 544(%rdi), %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vporq %ymm3, %ymm2, %ymm27 ; AVX512DQ-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa 672(%rdi), %xmm4 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm8[2],ymm3[3,4,5],ymm8[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm6 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm4 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm13 ; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm16 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm16[0,1,0,2] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm18[0,1,0,2] ; AVX512DQ-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm15 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm20 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7,8,9,10],ymm0[11],ymm2[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3,4,5],xmm3[6],xmm5[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm11[2],ymm13[3,4,5],ymm11[6],ymm13[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm19 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[0,1,1,3,4,5,5,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm10 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0],xmm15[1],xmm10[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm15 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm1[1],xmm15[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, %xmm11 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,0,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 528(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm21 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7,8,9,10],ymm6[11],ymm7[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2],ymm2[3],ymm14[4,5],ymm2[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm27 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 608(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm22 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm22[0,1,0,2] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm30[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm5, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 608(%rdi), %ymm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm14[2],ymm12[3,4,5],ymm14[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm20 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm20[0,1,0,2] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,6],ymm5[7] ; AVX512DQ-SLOW-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm25 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4],ymm4[5,6,7,8,9,10,11],ymm6[12],ymm4[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm8, %ymm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm30 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX512DQ-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2],ymm0[3],ymm13[4,5],ymm0[6],ymm13[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm16 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm16[0,1,1,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,1,2] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm10, %xmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm15, %xmm20 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm15, %xmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm11, %xmm22 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm31 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm28, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm8, %ymm4 -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm4, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm5 +; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm5, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3,4,5],xmm4[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm22[0,1,1,2] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm24 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm14, %ymm9 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm20[0,1,1,2] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm14 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm4[1,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm8 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] +; AVX512DQ-SLOW-NEXT: vporq %ymm8, %ymm7, %ymm30 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm4[1],ymm11[2,3,4],ymm4[5],ymm11[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm7, %ymm4 -; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm6, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm13 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm16[0,1,1,3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm28[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 -; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm17, %xmm8 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm18[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm8 +; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm17, %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm22[0,1,1,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm5 -; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm25, %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm11[1],ymm9[2,3,4],ymm11[5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm3 +; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm25, %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3],xmm0[4],xmm4[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm4[2],ymm12[3,4],ymm4[5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm13 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5],xmm0[6],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5],xmm5[6],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm22, %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm14 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm2[2],xmm7[3],xmm2[4],xmm7[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 656(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm20, %xmm8 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9 +; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm23, %xmm8 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3],xmm7[4],xmm8[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 656(%rdi), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0,1,2,3,4,5],xmm4[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,7,6] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 -; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm24, %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm15 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm6[1],xmm0[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7],ymm8[8,9,10,11,12],ymm6[13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm30[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7],ymm7[8,9,10,11,12],ymm6[13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] ; AVX512DQ-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 {%k1} # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm14[2,3],ymm3[4,5],ymm14[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm16 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm7[1],xmm4[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm27 {%k1} # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm18 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3],xmm6[4],xmm0[5],xmm6[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 @@ -14734,1249 +14739,1250 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm7 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa 800(%rdi), %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa 768(%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm5 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa 832(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 864(%rdi), %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 864(%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm21 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm22 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm7 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6],ymm3[7,8,9,10,11,12,13],ymm12[14],ymm3[15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm9[2],ymm3[3,4],ymm9[5],ymm3[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,3,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm7[3],ymm13[4,5],ymm7[6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm18 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2,3],xmm1[4],xmm12[5],xmm1[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7,8],ymm12[9,10,11,12,13,14],ymm14[15] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3,4,5,6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm8 -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm12, %xmm30 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm30[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm14 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm9[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6],ymm9[7,8,9,10,11,12,13],ymm13[14],ymm9[15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3,4,5,6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,3,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm3[3],ymm15[4,5],ymm3[6],ymm15[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2,3],xmm1[4],xmm12[5],xmm1[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3,4,5,6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm12 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm13[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm23 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2,3],xmm9[4],xmm13[5],xmm9[6],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, %ymm14 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1,2,3,4,5,6],ymm15[7,8],ymm13[9,10,11,12,13,14],ymm15[15] +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm9, %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3,4,5,6],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm13, %xmm29 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm29[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm11[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm24 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm12 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4,5,6,7,8],ymm1[9],ymm12[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm5[2],ymm14[3,4,5],ymm5[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5],xmm9[6],xmm11[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm9, %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm11[1,2,3,4,5,6],ymm13[7,8],ymm11[9,10,11,12,13,14],ymm13[15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm9[3,4,5,6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm9 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4,5,6,7,8],ymm0[9],ymm9[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm4[2],ymm1[3,4,5],ymm4[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm13[4],xmm9[5],xmm13[6],xmm9[7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm12, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm1, %ymm17, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm23 -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm1, %xmm30 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm31 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm9, %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm23, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm29 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm29[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm9 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm7 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3,4,5,6,7,8],ymm1[9],ymm7[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm15[2],ymm3[3,4,5],ymm15[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm11[4],xmm7[5],xmm11[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm1, %ymm17, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm2[3],ymm10[4,5],ymm2[6],ymm10[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm3 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3,4,5],xmm7[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm28[0,1,2,0,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm7 = mem[0,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm1, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm6[3],ymm14[4,5],ymm6[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm30 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3,4,5],xmm1[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm14[2],ymm9[3,4,5],ymm14[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm11[4],xmm7[5],xmm11[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm22 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,2,0] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm5[1],ymm15[2,3,4],ymm5[5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm15 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm11 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7,8],ymm0[9],ymm11[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm14[4],xmm11[5],xmm14[6],xmm11[7] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm11, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7],ymm11[8,9,10],ymm7[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm17 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm17 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3,4,5],xmm7[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm27[0,1,2,0,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm7 = mem[0,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm1, %zmm16 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm28 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3,4,5],xmm1[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm21 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,1,2,0] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm10[2],ymm2[3,4,5],ymm10[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm18 +; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm23, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm25 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm8 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm19[0,1,2,0,4,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm11 = mem[0,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4,5],ymm2[6],ymm3[7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5],xmm13[6],xmm11[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,2,0] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm13, %ymm13 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm13, %ymm0 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7],ymm7[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm16 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm11 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6],ymm11[7,8,9,10,11,12,13],ymm1[14],ymm11[15] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3,4,5],xmm11[6],xmm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1],ymm12[2,3],ymm0[4,5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm22[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm24 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7,8,9,10],ymm14[11],ymm13[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm31 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3,4,5],xmm12[6],xmm14[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm13, %ymm13 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm13, %ymm12, %ymm12 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm11, %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = ymm6[0,1],mem[2],ymm6[3,4,5],mem[6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm11, %ymm12 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7,8,9,10,11,12,13],ymm11[14],ymm12[15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm28, %zmm24 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm20[0,1,2,0,4,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm11 = mem[0,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm30 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm22 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1],ymm5[2],ymm9[3,4],ymm5[5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm20 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,2,0] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm11, %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1],ymm6[2],ymm10[3,4,5],ymm6[6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm21 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5],xmm14[6],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,6,5,4] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7],ymm11[8,9,10],ymm13[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm30 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm13 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6],ymm13[7,8,9,10,11,12,13],ymm0[14],ymm13[15] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,1],ymm10[2],mem[3,4],ymm10[5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm20, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1],ymm14[2,3],ymm10[4,5],ymm14[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm21[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm29 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7,8,9,10],ymm13[11],ymm12[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7],ymm7[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm12, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm0, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4],ymm7[5,6,7,8,9,10,11],ymm11[12],ymm7[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7],ymm7[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm12, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm18 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6],ymm12[7,8],ymm11[9,10,11,12,13,14],ymm12[15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm10[2],ymm2[3,4],ymm10[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1],ymm11[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm13 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm0, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4],ymm7[5,6,7,8,9,10,11],ymm12[12],ymm7[13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm19 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm10 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2],xmm14[3],xmm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0,1,2],ymm12[3,4,5,6,7],ymm7[8,9,10],ymm12[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm10, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm1[3],ymm6[4,5],ymm1[6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm5 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6],ymm12[7,8],ymm10[9,10,11,12,13,14],ymm12[15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm10, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm0, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6,7,8],ymm8[9],ymm7[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2],ymm15[3],ymm2[4,5],ymm15[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3,4,5],xmm0[6],xmm13[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7,8,9,10],ymm14[11],ymm13[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0],xmm9[1],xmm14[2,3,4,5],xmm9[6],xmm14[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7],ymm13[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm13, %ymm9, %ymm9 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3,4,5],ymm14[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm9 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6],ymm9[7,8,9,10,11,12,13],ymm0[14],ymm9[15] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm28, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5],xmm0[6],xmm9[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm20[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3],ymm9[4,5,6,7,8,9,10],ymm11[11],ymm9[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4,5],ymm0[6],mem[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,3,1] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm14 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,6,4,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm20, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3,4,5],xmm11[6],xmm9[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm9, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm8 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3,4,5],xmm9[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2],xmm9[3],xmm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm9, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm21 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm1[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6],ymm9[7,8],ymm1[9,10,11,12,13,14],ymm9[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm20 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1],ymm1[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm13 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm26 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3,4,5],xmm9[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2],xmm9[3],xmm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm9, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm14, %ymm12 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm1[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6],ymm9[7,8],ymm1[9,10,11,12,13,14],ymm9[15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm14[2,3],ymm6[4,5],ymm14[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm9 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1],ymm1[2],mem[3,4,5],ymm1[6],mem[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,3,1] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4,5],mem[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,3,1] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm11 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm7[1],ymm12[2,3],ymm7[4],ymm12[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6,7,8],ymm3[9],ymm1[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm14[3],ymm6[4,5],ymm14[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,3,1] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1,2],ymm0[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 64(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rcx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%r8) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r9) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQ-SLOW-NEXT: addq $1496, %rsp # imm = 0x5D8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQ-SLOW-NEXT: addq $1560, %rsp # imm = 0x618 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i16_stride7_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <2,5,9,u,12,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm17, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: subq $1288, %rsp # imm = 0x508 +; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <2,5,9,u,12,u,u,u> +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqa 480(%rdi), %ymm15 ; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm15[2],ymm6[3,4,5],ymm15[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm15[2],ymm6[3,4,5],ymm15[6],ymm6[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm22 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm6 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm6 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vporq %ymm4, %ymm6, %ymm16 +; AVX512DQ-FAST-NEXT: vporq %ymm4, %ymm6, %ymm29 ; AVX512DQ-FAST-NEXT: vmovdqa 672(%rdi), %xmm7 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm20 ; AVX512DQ-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm7 -; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm23 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm23[0,1,0,2] +; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm24 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm19 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm19[0,1,0,2] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm13[2],ymm6[3,4,5],ymm13[6],ymm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm8 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm12 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm2 -; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm3 -; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512DQ-FAST-NEXT: vporq %ymm1, %ymm2, %ymm28 +; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm13 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm16 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm5[2],ymm10[3,4,5],ymm5[6],ymm10[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm12[1],xmm0[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm8[2],ymm1[3,4,5],ymm8[6],ymm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm11 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2,3,4,5,6],ymm9[7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm3[1],xmm13[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm13, %xmm31 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm10 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm27 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm1 -; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm18, %zmm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm7, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4],xmm2[5],xmm7[6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %ymm26 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm3 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm26[0,1,0,2] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm0[2],ymm3[3,4,5],ymm0[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa 688(%rdi), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm14[1],xmm7[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm20 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <2,6,9,u,13,u,u,u> -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm13, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3,4,5],xmm6[6],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm17, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm4, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm24 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3,4,5],xmm4[6],xmm7[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm29 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %ymm21 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm21[0,1,0,2] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa 688(%rdi), %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, (%rsp) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm3[1],xmm12[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm11, %ymm20 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm16 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [2,5,2,5,2,5,2,5] -; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm10, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm11[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm22 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm12 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpermd %ymm19, %ymm10, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm5 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm14 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm9 +; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm9, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu %ymm15, (%rsp) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm17, %zmm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm8, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3,4,5],xmm3[6],xmm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm10, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm6 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm14, %xmm30 -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm7 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm27 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm17, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm8, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpermd %ymm21, %ymm10, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm12, %xmm22 +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm2[1],xmm6[2],xmm2[3],xmm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [10,3,6,15,12,13,6,15] +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm16, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm9, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3,4,5],xmm8[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3,4,5],xmm9[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm14 ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm23[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm14 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm21 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2],xmm4[3],xmm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm17, %zmm11 -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm7 -; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm4, %ymm0 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm19[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm12, %zmm20 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm1[1],ymm15[2,3,4],ymm1[5],ymm15[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm5[1],xmm12[2],xmm5[3],xmm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm16, %zmm12 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm6 +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm5, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2,3,4,5],xmm7[6],xmm4[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm7 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm26[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm31 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm22, %xmm7 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm19 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-FAST-NEXT: movw $992, %ax # imm = 0x3E0 -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm16 {%k1} # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4],ymm0[5],ymm11[6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5],xmm6[6],xmm5[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm21[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm9[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm23 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm9 -; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm20, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,3,3,3,0,3,7,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm4, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7],ymm5[8,9,10,11,12],ymm1[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm8 +; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm31, %xmm6 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm2, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm26, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: movw $992, %ax # imm = 0x3E0 +; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm24, %zmm2, %zmm29 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 +; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm22, %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,3,3,3,0,3,7,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm19, %ymm13, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm5, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6,7],ymm7[8,9,10,11,12],ymm2[13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm11, %xmm5 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm20 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3],xmm5[4],xmm1[5],xmm5[6,7] +; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm11, %xmm7 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm17 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3],xmm9[4],xmm2[5],xmm9[6,7] ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,u,u,u,4,7,11,14> -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm10 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3,4,5,6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm10 -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm13, %zmm12 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm10[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 {%k1} # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm4, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm6, %zmm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm30, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm22 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm12, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,u,u,4,7,11,14> +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm10, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3,4,5,6],xmm12[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm26, %zmm12 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 {%k1} # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermd %ymm21, %ymm13, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm7, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm19, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3],xmm15[4],xmm5[5],xmm15[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm17 -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm16, %zmm14 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm14[0,1,2],xmm15[3,4,5,6],xmm14[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-FAST-NEXT: vpermd %zmm16, %zmm6, %zmm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2,3],xmm6[4],xmm14[5],xmm6[6],xmm14[7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,u,u,u,4,8,11,15> -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm7, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4,5,6],xmm15[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm16, %zmm10, %zmm15 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm15[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm15 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0,1,2],xmm8[3],xmm15[4],xmm8[5],xmm15[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28 +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm10, %zmm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4,5,6],xmm9[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm9 +; AVX512DQ-FAST-NEXT: vpermd %zmm9, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm11 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6],xmm8[7] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,u,u,u,4,8,11,15> +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm14, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3,4,5,6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm9, %zmm4, %zmm10 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm10[6,7] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2,3],xmm6[4],xmm15[5],xmm6[6],xmm15[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3,4,5,6],xmm4[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm13[2],ymm5[3,4,5],ymm13[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm11 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,u,u,u,5,8,12,15> -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4],xmm10[5],xmm5[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3,4,5,6],xmm8[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm13[2],ymm11[3,4,5],ymm13[6],ymm11[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,u,u,u,5,8,12,15> +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm10, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm16, %zmm5, %zmm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm9, %zmm26, %zmm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm27 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm6 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm27 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm27 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm12, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2],ymm3[3,4,5],ymm2[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm10, %zmm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm26, %zmm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermd %ymm21, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm4, %zmm1 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4,5],ymm1[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm4 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3,4,5],xmm1[6],xmm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm14 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2],ymm4[3],ymm15[4,5],ymm4[6],ymm15[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,11,2,11,12,5,8,9] -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm12, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,11,2,11,12,5,8,9] +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm10 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm21 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm21 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm10, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm20 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 864(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 832(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2],ymm3[3,4,5],ymm1[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa 832(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5],xmm10[6],xmm0[7] ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2],ymm6[3],ymm2[4,5],ymm6[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm19 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0],xmm8[1],xmm12[2,3,4,5],xmm8[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm22 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm31 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm23 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm23 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, %xmm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = <0,3,7,10,14,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm27, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm25 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm14, %ymm20 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm13 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,u,u,u,6,9,13,u> +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm15, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm10, %ymm12, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm18 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,3,7,10,14,u,u,u> -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermd %zmm22, %zmm8, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm31 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm25, %zmm31 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm11 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3,4,5],xmm1[6],xmm14[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm15, %ymm21 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <2,u,u,u,6,9,13,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm16, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm14, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm14 -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermd %zmm26, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm29 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm7 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm3[1],ymm10[2,3],ymm3[4],ymm10[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm30 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm27, %zmm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3,4,5],xmm0[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm15, %zmm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm11[1],ymm3[2,3],ymm11[4],ymm3[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm6, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm10, %ymm1 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm24 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm31 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm21 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm14, %xmm27 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm27 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm25 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,4,7,11,14,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm22, %zmm19, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm5[1],ymm11[2,3],ymm5[4],ymm11[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm11, %ymm20 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3,4,5],xmm15[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2],xmm15[3],xmm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = <0,4,7,11,14,u,u,u> +; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm22, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm15 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2,3,4,5],xmm15[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0],ymm13[1],ymm7[2,3,4],ymm13[5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm15[1],xmm8[2],xmm15[3],xmm8[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm12 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <3,u,u,u,6,10,13,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm16, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm12, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm26, %zmm19, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1],ymm8[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm18 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3,4,5],xmm8[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm14 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm14 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm6 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm22, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm8[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3,4,5],xmm8[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm16, %zmm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm11[1],ymm3[2,3,4],ymm11[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm8, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <1,4,8,11,15,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm22, %zmm6, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2],xmm4[3],xmm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,8,11,15,u,u,u> +; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm13[2],ymm7[3,4],ymm13[5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm4 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,10,3,14,7,10,3] -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm7, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1,2],ymm5[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7],ymm2[8,9,10],ymm4[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm12[3],ymm2[4,5],ymm12[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm7, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm4[1,2],ymm11[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm12 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vpermd %zmm26, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm11[1],ymm3[2,3,4],ymm11[5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0],xmm4[1],xmm10[2],xmm4[3],xmm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1,2],ymm7[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3,4,5,6,7],ymm3[8,9,10],ymm4[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm15[2],ymm3[3,4],ymm15[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm7, %zmm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm7 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm8 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm12 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm7, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, (%rdx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 64(%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 64(%r9) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, (%r9) +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, (%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 64(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-FAST-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512DQ-FAST-NEXT: addq $1288, %rsp # imm = 0x508 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll index e88f9e1ebee09..91eba6c880376 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -535,18 +535,18 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512F-SLOW-LABEL: load_i16_stride8_vf8: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,4,0,4] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm3, %xmm6 +; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm0, %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm10 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 @@ -563,99 +563,100 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7] ; AVX512F-SLOW-NEXT: vpermt2d %xmm13, %xmm15, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpermi2d %xmm0, %xmm1, %xmm3 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpermi2d %xmm1, %xmm2, %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm2[1],xmm10[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm3[1],xmm10[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm1[0,1,2],xmm10[3] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm2[0,1,2],xmm10[3] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX512F-SLOW-NEXT: vpermt2d %xmm2, %xmm15, %xmm4 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpermt2d %xmm3, %xmm15, %xmm4 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm6, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa %xmm7, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa %xmm8, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %xmm9, (%r8) -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, (%r9) +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%r9) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa %xmm5, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa %xmm10, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, (%rax) ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride8_vf8: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm5 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] -; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [0,4,0,4] +; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm0 +; AVX512F-FAST-NEXT: vpermt2d %xmm10, %xmm3, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm13 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [1,5,1,5] -; AVX512F-FAST-NEXT: vmovdqa %xmm13, %xmm15 -; AVX512F-FAST-NEXT: vpermt2d %xmm12, %xmm14, %xmm15 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 -; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm15 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] -; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm0, %xmm15 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7] -; AVX512F-FAST-NEXT: vpermt2d %xmm12, %xmm15, %xmm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm3, %xmm2 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] -; AVX512F-FAST-NEXT: vpermi2d %xmm5, %xmm6, %xmm14 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm15, %xmm6 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm2 +; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm9, %xmm2 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,6,2,6] +; AVX512F-FAST-NEXT: vpermt2d %xmm10, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [3,7,3,7] +; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm10, %xmm15 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm14 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX512F-FAST-NEXT: vpermi2d %xmm4, %xmm5, %xmm3 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] +; AVX512F-FAST-NEXT: vpermi2d %xmm6, %xmm7, %xmm9 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX512F-FAST-NEXT: vpermi2d %xmm4, %xmm5, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vpermt2d %xmm6, %xmm10, %xmm7 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, (%rsi) -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, (%rdx) -; AVX512F-FAST-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa %xmm7, (%r8) -; AVX512F-FAST-NEXT: vmovdqa %xmm2, (%r9) +; AVX512F-FAST-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa %xmm11, (%r8) +; AVX512F-FAST-NEXT: vmovdqa %xmm3, (%r9) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa %xmm8, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%rax) -; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa %xmm1, (%rax) +; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-FAST-NEXT: vmovdqa %xmm4, (%rax) ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: load_i16_stride8_vf8: @@ -715,74 +716,75 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i16_stride8_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $168, %rsp -; SSE-NEXT: movdqa 112(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm4 +; SSE-NEXT: movdqa 208(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm5 ; SSE-NEXT: movdqa 240(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm12 ; SSE-NEXT: movdqa 144(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm13 +; SSE-NEXT: movdqa 128(%rdi), %xmm10 +; SSE-NEXT: movdqa 176(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; SSE-NEXT: movdqa 32(%rdi), %xmm12 +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm10 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; SSE-NEXT: movdqa (%rdi), %xmm6 ; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm8 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movdqa %xmm7, %xmm15 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm9[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; SSE-NEXT: movdqa %xmm2, %xmm15 @@ -792,113 +794,114 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm7[2,3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm15 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpckhwd (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm9 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] ; SSE-NEXT: movdqa %xmm15, %xmm11 ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm11[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,2,2,2] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm11[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rdx) ; SSE-NEXT: movaps %xmm8, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%r8) ; SSE-NEXT: movaps %xmm1, (%r9) -; SSE-NEXT: movapd %xmm2, 16(%r9) +; SSE-NEXT: movapd %xmm12, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm9, (%rax) -; SSE-NEXT: movaps %xmm7, 16(%rax) +; SSE-NEXT: movaps %xmm7, (%rax) +; SSE-NEXT: movaps %xmm4, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm6, (%rax) -; SSE-NEXT: movapd %xmm10, 16(%rax) +; SSE-NEXT: movapd %xmm9, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps %xmm14, (%rax) @@ -930,8 +933,8 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] @@ -939,25 +942,25 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7] @@ -965,114 +968,114 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,1,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm13[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,5],xmm9[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm6[2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,5],xmm11[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm6[2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: addq $152, %rsp @@ -1081,98 +1084,97 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-ONLY-LABEL: load_i16_stride8_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $232, %rsp +; AVX2-ONLY-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-ONLY-NEXT: vpbroadcastd %xmm11, %xmm0 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-ONLY-NEXT: vpbroadcastd %xmm12, %xmm0 ; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-ONLY-NEXT: vpbroadcastd %xmm8, %xmm1 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-ONLY-NEXT: vpbroadcastd %xmm9, %xmm1 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm3[0,1,0,2] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm15 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3,4,5,6],ymm12[7] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm4[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm5[0,1,0,2] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm2[0,1,0,2] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm15[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,0,2] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm0[0,1,0,2] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd $15, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -1200,47 +1202,48 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm7[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm15[0,1,1,3] -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm10[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5,6],ymm8[7] +; AVX2-ONLY-NEXT: vpermq $212, (%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm8[0,2,2,3,4,6,6,7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm1[1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm1[1],xmm11[2,3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm15[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5,6],ymm4[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm12 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] @@ -1249,12 +1252,12 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) @@ -1266,10 +1269,10 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $232, %rsp +; AVX2-ONLY-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1287,49 +1290,49 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm27 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm30 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm7, %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm9 ; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 224(%rdi), %ymm16 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm16[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqa64 224(%rdi), %ymm18 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm18[0,1,0,2] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm18[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm20 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm20[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm20 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm20[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm21 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm21[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7] +; AVX512F-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm23 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm23[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm16 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm16[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm21[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm28 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1339,26 +1342,26 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm23[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7] -; AVX512F-SLOW-NEXT: vpermt2d %xmm14, %xmm17, %xmm15 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm13[2,3] +; AVX512F-SLOW-NEXT: vpermt2d %xmm4, %xmm17, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm0 @@ -1368,76 +1371,76 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm2 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512F-SLOW-NEXT: vpermi2d %xmm1, %xmm2, %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm7[2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm16[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm18[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5,6],ymm11[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm20[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm21[0,1,1,3] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm7[2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm20[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm23[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[1,1,1,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %xmm4, %xmm17, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %xmm4, %xmm17, %xmm3 ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm8, (%r8) -; AVX512F-SLOW-NEXT: vmovdqa %ymm5, (%r9) +; AVX512F-SLOW-NEXT: vmovdqa %ymm10, (%r9) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm7, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa %ymm6, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm5, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -1445,164 +1448,164 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm26 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm27 ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm29 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4] -; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm7, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm13 +; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm7, %xmm13 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 ; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm31 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 224(%rdi), %ymm17 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm17[0,1,0,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm18[0,1,0,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm14[0,1],xmm13[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 224(%rdi), %ymm16 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm18[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm17[0,1,0,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,1,2,0,4,5,6,4] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5,6],ymm13[7] -; AVX512F-FAST-NEXT: vmovdqa64 160(%rdi), %ymm20 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm20[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm16 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm16[0,1,0,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm22[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm23[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7] +; AVX512F-FAST-NEXT: vmovdqa64 160(%rdi), %ymm19 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm19[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm23[0,1,0,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm21[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [1,5,1,5] -; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm13, %xmm1 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm15, %xmm1 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [2,6,2,6] -; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm11, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm22[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm23[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5],ymm8[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm19 = [3,7,3,7] -; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm19, %xmm12 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,6,2,6] +; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm12, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm22[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm18 = [3,7,3,7] +; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm18, %xmm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm1 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm3, %xmm7 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm4, %xmm7 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm17[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm16[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm20[0,1,2,0,4,5,6,4] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm18[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm15 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm20[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm17[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5,6],ymm9[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm19[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm23[0,1,1,3] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpermi2d %xmm6, %xmm5, %xmm13 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vpermi2d %xmm6, %xmm5, %xmm15 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5],ymm8[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm3, %xmm11 +; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm4, %xmm12 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,1,1,3,4,5,5,7] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,1,1,3,4,5,5,7] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,1,2,3,7,5,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-FAST-NEXT: vpermt2d %xmm6, %xmm19, %xmm5 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vpermt2d %xmm6, %xmm18, %xmm5 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, (%rsi) ; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, (%rdx) ; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, (%rcx) ; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, (%r8) -; AVX512F-FAST-NEXT: vmovdqa %ymm4, (%r9) +; AVX512F-FAST-NEXT: vmovdqa %ymm2, (%r9) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa %ymm7, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -1702,11 +1705,11 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i16_stride8_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $728, %rsp # imm = 0x2D8 +; SSE-NEXT: subq $696, %rsp # imm = 0x2B8 ; SSE-NEXT: movdqa 496(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 480(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm3 @@ -1720,7 +1723,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 128(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 176(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] @@ -1743,372 +1746,372 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 464(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movdqa 448(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 416(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 416(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 400(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 384(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movdqa 384(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 368(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 304(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSE-NEXT: movdqa 304(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 288(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: movdqa 272(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: movdqa 96(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa 32(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: movdqa 64(%rdi), %xmm13 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movapd %xmm3, %xmm0 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movapd %xmm9, %xmm0 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm2[2],xmm12[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: punpckhwd (%rsp), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm12 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm1, %xmm13 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: movdqa (%rsp), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm14 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm5[0],xmm11[1] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[2,2,2,2] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movapd %xmm6, %xmm0 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[2,3] +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: unpckhps (%rsp), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -2142,29 +2145,28 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, 32(%rax) +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%rax) -; SSE-NEXT: movaps %xmm9, (%rax) +; SSE-NEXT: movaps %xmm10, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm12, 48(%rax) -; SSE-NEXT: movapd %xmm11, 32(%rax) -; SSE-NEXT: movapd %xmm10, 16(%rax) -; SSE-NEXT: movaps %xmm13, (%rax) +; SSE-NEXT: movapd %xmm13, 48(%rax) +; SSE-NEXT: movapd %xmm12, 32(%rax) +; SSE-NEXT: movapd %xmm11, 16(%rax) +; SSE-NEXT: movaps %xmm14, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm1, 48(%rax) ; SSE-NEXT: movaps %xmm2, 32(%rax) -; SSE-NEXT: movaps %xmm14, 16(%rax) +; SSE-NEXT: movaps %xmm15, 16(%rax) ; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: addq $728, %rsp # imm = 0x2D8 +; SSE-NEXT: addq $696, %rsp # imm = 0x2B8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $856, %rsp # imm = 0x358 +; AVX1-ONLY-NEXT: subq $872, %rsp # imm = 0x368 ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 @@ -2174,8 +2176,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 @@ -2192,31 +2194,31 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -2226,9 +2228,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 @@ -2247,8 +2249,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -2266,90 +2269,86 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm15 ; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm8 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1],xmm9[2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1],xmm13[2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1],mem[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm12[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0,1,2,3,4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vmovaps %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2360,8 +2359,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm14 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] @@ -2370,91 +2369,91 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[0,1,0,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vpunpckhwd (%rsp), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -2463,104 +2462,106 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm13[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm11[2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm15[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm13[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm7[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vmovaps %xmm7, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm12[2],xmm2[2],xmm12[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] @@ -2591,19 +2592,19 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $856, %rsp # imm = 0x358 +; AVX1-ONLY-NEXT: addq $872, %rsp # imm = 0x368 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i16_stride8_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $984, %rsp # imm = 0x3D8 +; AVX2-ONLY-NEXT: subq $1000, %rsp # imm = 0x3E8 ; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm3 @@ -2618,15 +2619,14 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, %xmm7 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, %xmm9 ; AVX2-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 @@ -2635,16 +2635,18 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2652,12 +2654,12 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -2685,145 +2687,148 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] +; AVX2-ONLY-NEXT: vmovdqa %xmm2, %xmm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm14 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm10[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm11[7] ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm2[0,1,0,2] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa %xmm7, %xmm14 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa %xmm9, %xmm14 +; AVX2-ONLY-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa %xmm10, %xmm11 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm13, %xmm8 -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm12[2],xmm15[2],xmm12[3],xmm15[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm9, %xmm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm11, %xmm12 +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpunpckhdq (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm13 = mem[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0],xmm5[1],xmm13[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm10[2],mem[2],xmm10[3],mem[3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -2839,62 +2844,60 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, %xmm15 ; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm0[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm7 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm9 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm11, %xmm1 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2902,244 +2905,239 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm13 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm12[7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm13 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa %xmm15, %xmm13 -; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[2,2,2,2] -; AVX2-ONLY-NEXT: vmovdqa %xmm10, %xmm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] ; AVX2-ONLY-NEXT: vmovdqa %xmm11, %xmm7 -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = xmm15[2],mem[2],xmm15[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = mem[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = mem[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: addq $984, %rsp # imm = 0x3D8 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-ONLY-NEXT: addq $1000, %rsp # imm = 0x3E8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride8_vf32: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512F-SLOW-NEXT: subq $616, %rsp # imm = 0x268 ; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm1, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa 272(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm26 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm25 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm0 +; AVX512F-SLOW-NEXT: vpermt2d %xmm3, %xmm1, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 304(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa 272(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16 ; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,1,2,0,4,5,6,4] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm2[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm19[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm2[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm20[0,1,2,0,4,5,6,4] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm2[0,1,0,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm2[0,1,0,2] ; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm2[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm21[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm22[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-SLOW-NEXT: movb $-64, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm8[3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm23 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm24 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm1, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 @@ -3150,118 +3148,122 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm2[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 224(%rdi), %ymm26 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm26[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm25[0,1,0,2] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm0[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm31[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm27 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm27[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm30[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,2] ; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm28 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm28[0,1,0,2] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm4 = xmm4[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm7 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm4 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm19[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm7[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm29[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm31[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm17[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm14[5],ymm10[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [3,7,3,7] -; AVX512F-SLOW-NEXT: vpermt2d %xmm9, %xmm8, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm5[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm22[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm30[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm17[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm15 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vpermt2d %xmm11, %xmm0, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vpermt2d %xmm6, %xmm8, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vpermt2d %xmm8, %xmm6, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -3272,178 +3274,182 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm30 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm21 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm4 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] -; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm2, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm2 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm1, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm18 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm9 -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm17 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm20 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm20[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm12 +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm19 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm21 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm21[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm29 = mem[0,1,1,3] ; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm23 = mem[0,1,1,3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm23[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm9 {%k1} +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm12 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm0 -; AVX512F-SLOW-NEXT: vpermi2d %xmm3, %xmm5, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512F-SLOW-NEXT: vpermi2d %xmm2, %xmm3, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm4 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm7 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm26[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm22[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm25[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm24[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm14[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm27[0,1,1,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm28[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm26[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm27 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm3[1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm6 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm12[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm8 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm17 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm17[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm24 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm25 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm28[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm25[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm16[0],xmm5[1],xmm16[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm4[1],xmm9[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm28 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm18[2],xmm1[3],xmm18[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm17[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm29[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm22[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm24[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm27 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm9 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm16[0],xmm18[0],xmm16[1],xmm18[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm4[2],xmm12[2],xmm4[3],xmm12[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm23[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm7[5],ymm11[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm24[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm26[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm16[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm18[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm9[0,1,2],xmm12[3] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [3,7,3,7] -; AVX512F-SLOW-NEXT: vpermt2d %xmm18, %xmm12, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm10[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vpermt2d %xmm4, %xmm12, %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm16[2],xmm5[3],xmm16[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%rsi) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%rdx) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%rcx) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%r8) +; AVX512F-SLOW-NEXT: vpermt2d %xmm16, %xmm12, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm4 {%k1} +; AVX512F-SLOW-NEXT: vpermt2d %xmm8, %xmm12, %xmm5 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm18[2],xmm9[3],xmm18[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rdx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rcx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%r8) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, (%r9) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-SLOW-NEXT: addq $520, %rsp # imm = 0x208 +; AVX512F-SLOW-NEXT: addq $616, %rsp # imm = 0x268 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -3454,43 +3460,42 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa 336(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] -; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 -; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm2 ; AVX512F-FAST-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512F-FAST-NEXT: vmovdqa 272(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm15 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm11 ; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm0[0,1,0,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX512F-FAST-NEXT: vmovdqa 448(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm1[0,1,0,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm27[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm27[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3500,25 +3505,26 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm28[0,2,2,3,4,6,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: movb $-64, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1} ; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vmovdqa %xmm10, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d %xmm11, %xmm3, %xmm0 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm13, %xmm0 +; AVX512F-FAST-NEXT: vpermt2d %xmm3, %xmm2, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 @@ -3528,56 +3534,57 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm24 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm24[0,1,0,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm31[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm23 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm23[0,1,0,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm31[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa64 160(%rdi), %ymm25 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm25[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm26 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm26[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,2,2,3,4,6,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,5,1,5] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm13 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm18[0],xmm5[0],xmm18[1],xmm5[1] +; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm11 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm16[0],xmm22[0],xmm16[1],xmm22[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm13, %xmm1 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-FAST-NEXT: vmovdqa %xmm6, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm11, %xmm1 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm25[0],xmm13[1],xmm25[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] @@ -3585,245 +3592,246 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm3 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm18[2],xmm5[2],xmm18[3],xmm5[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3] ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] -; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm0, %xmm3 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm21[2],xmm20[2],xmm21[3],xmm20[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm22[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpermt2d %xmm22, %xmm0, %xmm2 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm28[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm29[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa %xmm10, %xmm3 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512F-FAST-NEXT: vpermt2d %xmm11, %xmm0, %xmm3 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm7[2],xmm16[2],xmm7[3],xmm16[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm30[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm31[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5,6],ymm11[7] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm29 = [3,7,3,7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm29, %xmm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm29[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vmovdqa %xmm13, %xmm7 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm13[2],xmm25[2],xmm13[3],xmm25[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm25, %xmm0, %xmm7 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm6[2],xmm17[2],xmm6[3],xmm17[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm27 = [3,7,3,7] +; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm27, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm29, %xmm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm9[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm27, %xmm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm10[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-FAST-NEXT: vmovdqa %xmm6, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm1 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,4,0,4] -; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm0, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm30 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm8 -; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm27 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm27[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm23 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm23[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm22 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm21 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm22[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %xmm3, %xmm0, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm31 +; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm29 = mem[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm26 = mem[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm30 = mem[0,1,1,3] +; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm25 = mem[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm31 {%k1} ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm3, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm7 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm5, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm17 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm9 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm16 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm10[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm16 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm10[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm20 = mem[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm19 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm19[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm24[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm24[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,1,1,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm26[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm25[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5],ymm7[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [1,5,1,5] -; AVX512F-FAST-NEXT: vpermt2d %xmm30, %xmm8, %xmm0 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm20[0],xmm6[1],xmm20[1] +; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm21 = mem[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31 +; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm0 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,5,1,5] +; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm4, %xmm0 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm12[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa %xmm8, %xmm4 -; AVX512F-FAST-NEXT: vpermi2d %xmm16, %xmm9, %xmm4 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm31[0],xmm17[1],xmm31[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm4[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vmovdqa %xmm4, %xmm3 +; AVX512F-FAST-NEXT: vpermi2d %xmm16, %xmm7, %xmm3 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm22[0],xmm17[1],xmm22[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vmovdqa %xmm6, %xmm4 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm20[2],xmm6[3],xmm20[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm8 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm24[2],xmm12[3],xmm24[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] -; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm0, %xmm4 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm30[2],xmm3[3],xmm30[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm23[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm22[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm7[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm21[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5],ymm11[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vpermi2d %xmm31, %xmm17, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm9[2],xmm16[2],xmm9[3],xmm16[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm19[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm24[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm18[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %xmm30, %xmm29, %xmm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm0, %xmm8 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm18[2],xmm5[3],xmm18[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm29, %xmm9 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm31[2],xmm17[3],xmm31[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm25[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vpermi2d %xmm22, %xmm17, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm7[2],xmm16[2],xmm7[3],xmm16[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm20[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm21[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm27, %xmm5 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm10[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm27, %xmm7 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm22[2],xmm17[3],xmm22[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rsi) ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rdx) ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rcx) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm2, (%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, (%r9) -; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, (%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, (%r9) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-FAST-NEXT: addq $552, %rsp # imm = 0x228 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -3978,41 +3986,42 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i16_stride8_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1752, %rsp # imm = 0x6D8 +; SSE-NEXT: subq $1800, %rsp # imm = 0x708 ; SSE-NEXT: movdqa 752(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 736(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 240(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm8 +; SSE-NEXT: movdqa 128(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movdqa %xmm9, %xmm7 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] @@ -4046,16 +4055,17 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 608(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 592(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,0,0] +; SSE-NEXT: movdqa 576(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa 560(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4081,13 +4091,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 464(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa 448(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa 432(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4113,17 +4122,17 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 976(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 960(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 960(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa 944(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 928(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4144,29 +4153,30 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa 320(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa 304(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa 288(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, (%rsp) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 272(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 880(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 864(%rdi), %xmm1 @@ -4175,87 +4185,92 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 848(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 832(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movdqa 816(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 800(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 784(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 768(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa 832(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa 816(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm0 +; SSE-NEXT: movdqa 800(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 784(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm15 +; SSE-NEXT: movdqa 768(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -4263,19 +4278,19 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -4283,8 +4298,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4293,137 +4308,140 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movapd %xmm7, %xmm0 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movapd %xmm2, %xmm0 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movapd %xmm4, %xmm0 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movapd %xmm7, %xmm0 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movapd %xmm8, %xmm0 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -4435,43 +4453,42 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movdqa %xmm6, %xmm13 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -4484,32 +4501,33 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] @@ -4526,63 +4544,63 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,0,0] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -4594,10 +4612,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4615,63 +4631,61 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, (%rsp), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -4680,29 +4694,28 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 @@ -4711,35 +4724,37 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,2,2] -; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movapd (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -4749,30 +4764,30 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: movapd %xmm14, %xmm0 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: unpckhps (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE-NEXT: movaps %xmm2, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -4783,14 +4798,15 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd $255, (%rsp), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] @@ -4805,14 +4821,16 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4829,14 +4847,13 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm15[2],xmm12[3],xmm15[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] @@ -4932,8 +4949,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm12, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movaps %xmm12, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload @@ -4960,7 +4977,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm6, 32(%rax) ; SSE-NEXT: movaps %xmm8, 16(%rax) ; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: addq $1752, %rsp # imm = 0x6D8 +; SSE-NEXT: addq $1800, %rsp # imm = 0x708 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride8_vf64: @@ -4970,15 +4987,13 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 @@ -4990,38 +5005,38 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm9 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm11 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -5031,9 +5046,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 832(%rdi), %xmm2 @@ -5046,21 +5061,21 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 784(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 976(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm3 @@ -5073,14 +5088,15 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 928(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 896(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -5159,16 +5175,16 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2 @@ -5193,56 +5209,59 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1],xmm11[2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm9[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1],xmm13[2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] @@ -5267,91 +5286,89 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm4[0],mem[0],xmm4[1],mem[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1],mem[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm11[2],xmm14[2],xmm11[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1,2],xmm15[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,3,4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm5[2],xmm12[3],xmm5[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm11[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1,2],xmm15[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -5363,8 +5380,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm15 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm9[2],xmm14[3],xmm9[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload @@ -5384,78 +5400,78 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm14 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm13[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[3,3,3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm10[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm12[3,3,3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm12[2],xmm8[3],xmm12[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-ONLY-NEXT: vpermilps $238, (%rsp), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,1,0,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm15[0],xmm9[1],xmm15[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] @@ -5464,13 +5480,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm11 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -5481,9 +5495,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -5495,7 +5508,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload @@ -5508,7 +5522,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] @@ -5542,55 +5556,55 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -5599,49 +5613,51 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm10[2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm13[0],mem[0],xmm13[1],mem[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[1,1,1,1] @@ -5659,28 +5675,28 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm4[0],mem[0],xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm12[0],mem[0],xmm12[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -5689,72 +5705,73 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm15[2],xmm9[3],xmm15[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm15[2],xmm3[2],xmm15[3],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm11[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm14[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -5765,8 +5782,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm14 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm15[2],mem[2],xmm15[3],mem[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload @@ -5776,58 +5793,58 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, (%rsp), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm13[2],mem[2],xmm13[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm12[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -5897,7 +5914,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-ONLY-LABEL: load_i16_stride8_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $2472, %rsp # imm = 0x9A8 +; AVX2-ONLY-NEXT: subq $2408, %rsp # imm = 0x968 ; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm3 @@ -5926,7 +5943,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] @@ -5985,9 +6002,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5995,12 +6012,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6008,13 +6025,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6040,14 +6056,14 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm0 @@ -6060,7 +6076,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm1 @@ -6068,84 +6084,84 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 624(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastd %xmm6, %xmm6 -; AVX2-ONLY-NEXT: vmovdqa 592(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastd %xmm7, %xmm7 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 560(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 624(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 592(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastd %xmm5, %xmm5 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 560(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm7[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,2] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,1,0,2] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5,6],ymm10[7] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm10[0,1,0,2] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm6[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm11[0,1,0,2] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm5[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1,2,3,4,5,6],ymm8[7] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm8[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm9[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm10[1],xmm12[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm8[1],xmm12[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] ; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] @@ -6160,31 +6176,31 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,1,1] +; AVX2-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,1,1] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,1,1] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm12[1],xmm4[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] @@ -6192,15 +6208,15 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] @@ -6215,83 +6231,84 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] @@ -6300,17 +6317,17 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6326,9 +6343,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6337,7 +6354,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[3,3,3,3] +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -6347,8 +6365,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4],ymm3[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6357,30 +6376,30 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpshufd $255, (%rsp), %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = mem[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[3,3,3,3] +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -6389,21 +6408,23 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, %xmm6 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm12, %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, %xmm10 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] @@ -6461,12 +6482,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] @@ -6474,11 +6493,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6497,13 +6515,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovdqa %xmm3, %xmm9 -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] @@ -6511,12 +6528,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] @@ -6524,299 +6539,293 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm12, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] -; AVX2-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm0[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa %xmm10, %xmm4 -; AVX2-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa %xmm6, %xmm10 -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] +; AVX2-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm11[0],mem[0],xmm11[1],mem[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm13[1],xmm4[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX2-ONLY-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm6[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm11[7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm10[2],xmm13[2],xmm10[3],xmm13[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm6[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5],ymm10[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0],xmm10[1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX2-ONLY-NEXT: vpshufd $238, (%rsp), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm4 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm5 = mem[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload @@ -6828,7 +6837,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -6879,7 +6888,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm15, 64(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -6887,50 +6896,50 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 96(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 64(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, (%rax) -; AVX2-ONLY-NEXT: addq $2472, %rsp # imm = 0x9A8 +; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-ONLY-NEXT: addq $2408, %rsp # imm = 0x968 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $2424, %rsp # imm = 0x978 -; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: subq $2408, %rsp # imm = 0x968 +; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,4,0,4] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm6, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 304(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa 272(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm31 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6942,7 +6951,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] @@ -6953,12 +6962,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] @@ -6966,16 +6975,16 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: movb $-64, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6983,22 +6992,22 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm3 -; AVX512F-SLOW-NEXT: vpermt2d %xmm1, %xmm6, %xmm3 +; AVX512F-SLOW-NEXT: vpermt2d %xmm1, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2] @@ -7009,357 +7018,351 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,4] ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm2[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm2[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm27[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm3[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm3[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm22[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 880(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 864(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 880(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa 848(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 864(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa 848(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 832(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 832(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm3 -; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm6, %xmm3 +; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm6, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm10 ; AVX512F-SLOW-NEXT: vmovdqa 816(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 800(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa 800(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa 784(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 768(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa 992(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqa 768(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa 992(%rdi), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 960(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vmovdqa 928(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 960(%rdi), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,2] ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 896(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 624(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 896(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm2[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 624(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 608(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa 592(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa 592(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm15[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm15 -; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm6, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa 560(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 544(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm9[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpermt2d %xmm1, %xmm10, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 560(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 512(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 544(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa 528(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm28 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3] -; AVX512F-SLOW-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqa 704(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm19[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vmovdqa 672(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm0[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqa 640(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm0[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa 512(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm15[0],xmm8[0],xmm15[1],xmm8[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vmovdqa 736(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-SLOW-NEXT: vmovdqa 672(%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm5[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqa 640(%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm5[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm28[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm30[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm20[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm31[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm14 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm30[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm4 = xmm4[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm22[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4],ymm4[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm28[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3] +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm9 = xmm9[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm20[2],xmm10[2],xmm20[3],xmm10[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm23 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm20 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm19[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-SLOW-NEXT: vpshufd $212, (%rsp), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm29 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4],ymm6[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm27[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm25[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm30[2],xmm2[2],xmm30[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm25 -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm13 = xmm13[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm8, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm22[2],xmm7[2],xmm22[3],xmm7[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm27 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm31[2],xmm14[2],xmm31[3],xmm14[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm16 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm15 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm15[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm13[7] -; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm13 = mem[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5],ymm9[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm28[2],xmm21[2],xmm28[3],xmm21[3] -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm3 = xmm1[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm19[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm17[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm16[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm24[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm22[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm13[2],xmm17[3],xmm13[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm22 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm2 = xmm2[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-SLOW-NEXT: vpermt2d %xmm20, %xmm16, %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm19[2],xmm20[2],xmm19[3],xmm20[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm24 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm5[2],xmm15[3],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 +; AVX512F-SLOW-NEXT: vpblendd $12, (%rsp), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm3 = xmm0[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm28[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm1 +; AVX512F-SLOW-NEXT: vpermt2d %xmm16, %xmm0, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm0 = xmm1[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm3 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm24, %ymm5 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm13 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm1 -; AVX512F-SLOW-NEXT: vpermt2d %xmm25, %xmm16, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512F-SLOW-NEXT: vpermt2d %xmm22, %xmm16, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpshufhw $212, (%rsp), %ymm3 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512F-SLOW-NEXT: vpermt2d %xmm27, %xmm16, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-SLOW-NEXT: vpermt2d %xmm24, %xmm16, %xmm0 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm15[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512F-SLOW-NEXT: vpermt2d %xmm21, %xmm16, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpermt2d %xmm18, %xmm16, %xmm15 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm1 = xmm15[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7373,21 +7376,21 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,4,0,4] -; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm4, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm9 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm21 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm10, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3] @@ -7430,7 +7433,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpermt2d %xmm1, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vpermt2d %xmm1, %xmm10, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] @@ -7450,22 +7453,21 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm30 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm17 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm22 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm23 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm22[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm30 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm30[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7478,292 +7480,304 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm10, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm20 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm24 -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm26 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm27 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm26[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm27[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm28 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12 +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm22 = mem[0,1,1,3] ; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm29 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm28[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm24 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm22[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm14 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm0 -; AVX512F-SLOW-NEXT: vpermi2d %xmm1, %xmm2, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm31 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm18 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm31[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm16 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm17 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vmovdqa %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vpermi2d %xmm2, %xmm8, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm31 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm21 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm0[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm10 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm28[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] +; AVX512F-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm14 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm13 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm15[1],xmm9[2,3] +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm9 = xmm9[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm16[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm21[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpblendd $12, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm28[2],xmm10[2],xmm28[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5],ymm15[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm25 +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm30, %ymm7 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm7[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5],ymm15[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm30[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm7[1],xmm12[2,3] -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm12 = xmm12[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm10 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm20[0],xmm25[0],xmm20[1],xmm25[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm21[2],xmm9[2],xmm21[3],xmm9[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm20 -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm2 = xmm2[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm23[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm15[5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm30[2],xmm7[2],xmm30[3],xmm7[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm22 -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm8 = xmm8[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm15[2],xmm17[3],xmm15[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm23 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm2 = xmm2[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm26[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm27[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm15[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm28[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5],ymm9[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm25[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm14[2],xmm26[2],xmm14[3],xmm26[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm31[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm16[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm20[2],xmm16[3],xmm20[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm30 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm24[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm4 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm6[2],xmm31[3],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm29 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm31 +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512F-SLOW-NEXT: vpermt2d %xmm20, %xmm16, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 +; AVX512F-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm24, %ymm2 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4],ymm6[5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm2 -; AVX512F-SLOW-NEXT: vpermt2d %xmm22, %xmm16, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm2 = xmm2[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm3 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm6 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm6 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm3 +; AVX512F-SLOW-NEXT: vpermt2d %xmm23, %xmm16, %xmm3 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm3 = xmm3[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm6 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpermt2d %xmm23, %xmm16, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm2 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm2 = xmm13[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm6 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm8[5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vpermt2d %xmm26, %xmm16, %xmm14 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm10[2],xmm25[2],xmm10[3],xmm25[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0,1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm3 +; AVX512F-SLOW-NEXT: vpermt2d %xmm30, %xmm16, %xmm3 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm3 = xmm3[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vpermt2d %xmm29, %xmm16, %xmm4 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm22[2],xmm24[2],xmm22[3],xmm24[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm7 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm4 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-SLOW-NEXT: vmovaps %zmm3, 64(%rsi) ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -7796,13 +7810,13 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-SLOW-NEXT: addq $2424, %rsp # imm = 0x978 +; AVX512F-SLOW-NEXT: addq $2408, %rsp # imm = 0x968 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride8_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $2392, %rsp # imm = 0x958 +; AVX512F-FAST-NEXT: subq $2312, %rsp # imm = 0x908 ; AVX512F-FAST-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %xmm1 @@ -7812,12 +7826,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] -; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512F-FAST-NEXT: vmovdqa %xmm14, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm5 ; AVX512F-FAST-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7830,8 +7843,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm30 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %ymm1 @@ -7873,15 +7887,16 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 ; AVX512F-FAST-NEXT: vpermt2d %xmm3, %xmm5, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm19 +; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 @@ -7891,12 +7906,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm6 ; AVX512F-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 -; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm18 -; AVX512F-FAST-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7906,11 +7919,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2] ; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm27 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm28 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm3 ; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7920,10 +7933,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,1,0,2] ; AVX512F-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -7933,352 +7946,351 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 864(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa 848(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 832(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm3 -; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm5, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm26 +; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm2 +; AVX512F-FAST-NEXT: vpermt2d %xmm3, %xmm5, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 ; AVX512F-FAST-NEXT: vmovdqa 816(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 800(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa 800(%rdi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512F-FAST-NEXT: vmovdqa 784(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 768(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm24 +; AVX512F-FAST-NEXT: vmovdqa 768(%rdi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm25 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm8 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm9 ; AVX512F-FAST-NEXT: vmovdqa 992(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 960(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,4] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vmovdqa 928(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 896(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm1[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5],ymm12[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm9 {%k1} ; AVX512F-FAST-NEXT: vmovdqa 624(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 608(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa 592(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 576(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm29 +; AVX512F-FAST-NEXT: vpermt2d %xmm11, %xmm2, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm16 ; AVX512F-FAST-NEXT: vmovdqa 560(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 544(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-FAST-NEXT: vmovdqa 528(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 512(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm29 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm26 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm21 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vmovdqa 672(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 640(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [1,5,1,5] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpermt2d %xmm30, %xmm11, %xmm8 -; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm13 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm16[0],xmm20[0],xmm16[1],xmm20[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3] -; AVX512F-FAST-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5,6],ymm11[7] -; AVX512F-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5],ymm11[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm11 -; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm13, %xmm11 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm31[0],xmm15[0],xmm31[1],xmm15[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm17 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm14[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm14 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm8 -; AVX512F-FAST-NEXT: vpermt2d %xmm24, %xmm13, %xmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm11 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm25[0],xmm23[0],xmm25[1],xmm23[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm18 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm0[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 +; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm9, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm30 +; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm13 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm14[0],xmm24[0],xmm14[1],xmm24[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] +; AVX512F-FAST-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] +; AVX512F-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm12 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload +; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm13, %xmm9 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm20[0],xmm19[0],xmm20[1],xmm19[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm11 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm11 +; AVX512F-FAST-NEXT: vpermt2d %xmm25, %xmm13, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm8 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm26[0],xmm27[0],xmm26[1],xmm27[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm19 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm4 -; AVX512F-FAST-NEXT: vpermt2d %xmm26, %xmm13, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm13 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm28[0],xmm19[0],xmm28[1],xmm19[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512F-FAST-NEXT: vpermt2d %xmm21, %xmm13, %xmm3 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm29[0],xmm16[0],xmm29[1],xmm16[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm1 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm20[2],xmm16[3],xmm20[3] -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] -; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm0, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, %xmm4 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm22[2],xmm30[2],xmm22[3],xmm30[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm27 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vmovdqa %xmm14, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm24[2],xmm14[3],xmm24[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,6,2,6] +; AVX512F-FAST-NEXT: vpermt2d %xmm24, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm28 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm30[2],xmm31[3],xmm30[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm31 +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm28 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm27 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm24 = xmm17[2],xmm15[2],xmm17[3],xmm15[3] -; AVX512F-FAST-NEXT: vpermt2d %xmm15, %xmm4, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm19 # 16-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 (%rsp), %xmm17 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm19[2],xmm17[2],xmm19[3],xmm17[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm26 = xmm20[2],xmm2[2],xmm20[3],xmm2[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm5, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm17[2],xmm12[3],xmm17[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm12, %xmm25 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm23 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm25 +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4],ymm3[5],ymm14[6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm21 = xmm18[2],xmm23[2],xmm18[3],xmm23[3] -; AVX512F-FAST-NEXT: vpermt2d %xmm23, %xmm4, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm16 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm23[2],xmm11[2],xmm23[3],xmm11[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm17 = xmm19[2],xmm8[2],xmm19[3],xmm8[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm8, %xmm5, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm20 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm22 -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm24 +; AVX512F-FAST-NEXT: vpshufd $212, (%rsp), %ymm9 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24 {%k1} +; AVX512F-FAST-NEXT: vmovdqa %xmm6, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm6[2],xmm16[2],xmm6[3],xmm16[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm5, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm22[2],xmm21[2],xmm22[3],xmm21[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm22 {%k1} -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm20 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; AVX512F-FAST-NEXT: vpermt2d %xmm13, %xmm4, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm15 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm14 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm18 = [3,7,3,7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm18, %xmm0 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm28, %xmm0, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 +; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm0 = xmm1[0,1],mem[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm2 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm13 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4],ymm2[5],ymm13[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm3 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm15 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm18, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm9 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm23, %xmm16, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm18, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm16, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %xmm7, %xmm18, %xmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm2[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm21, %xmm16, %xmm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8297,11 +8309,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [0,4,0,4] ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm5, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm16 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm29 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm30 -; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm17 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -8333,8 +8344,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm8 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm6 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] @@ -8345,13 +8356,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512F-FAST-NEXT: vpermt2d %xmm8, %xmm5, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm27 +; AVX512F-FAST-NEXT: vpermt2d %xmm6, %xmm5, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm18 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm31 -; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm29 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,1,3] @@ -8364,7 +8374,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm27 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm26 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,1,3] @@ -8394,28 +8404,25 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm31 ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm5, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm26 -; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm21 ; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm25 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm13 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12 ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,1,3] @@ -8423,13 +8430,13 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1} +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -8437,50 +8444,52 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm1 ; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm3, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm12, %xmm25 -; AVX512F-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm28 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm16 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm28 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm30 ; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm0[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] +; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,1,3] +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5] -; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm15, %xmm0 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm16[0],xmm9[1],xmm16[1] +; AVX512F-FAST-NEXT: vpermt2d %xmm19, %xmm15, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] @@ -8495,14 +8504,14 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm29, %xmm15, %xmm1 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm23[0],xmm8[0],xmm23[1],xmm8[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm24, %xmm15, %xmm1 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm27[0],xmm18[0],xmm27[1],xmm18[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] ; AVX512F-FAST-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm8 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm13 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] @@ -8511,1379 +8520,1351 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d %xmm19, %xmm15, %xmm0 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm22[0],xmm26[0],xmm22[1],xmm26[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-FAST-NEXT: vpermt2d %xmm25, %xmm15, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm13 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm31[0],xmm20[0],xmm31[1],xmm20[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm21 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm8 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} ; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm6 -; AVX512F-FAST-NEXT: vpermi2d %xmm28, %xmm25, %xmm6 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm24[0],xmm2[0],xmm24[1],xmm2[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512F-FAST-NEXT: vpermi2d %xmm28, %xmm30, %xmm6 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm16[0],xmm17[1],xmm16[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm7 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm1 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm16[2],xmm9[3],xmm16[3] -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] -; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm0, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm27 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm16[2],xmm17[2],xmm16[3],xmm17[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm29[2],xmm9[3],xmm29[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,6,2,6] +; AVX512F-FAST-NEXT: vpermt2d %xmm29, %xmm5, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm23[2],xmm19[2],xmm23[3],xmm19[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm31 +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm20 = xmm27[2],xmm18[2],xmm27[3],xmm18[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm5, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm22[2],xmm24[2],xmm22[3],xmm24[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm18 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm11 -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm28 = xmm23[2],xmm1[2],xmm23[3],xmm1[3] -; AVX512F-FAST-NEXT: vpermt2d %xmm1, %xmm0, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm18 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm29[2],xmm18[2],xmm29[3],xmm18[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm8[0,1],xmm7[2,3] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm0 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm0[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm9 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm11, %zmm0 +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm12 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm21 = xmm22[2],xmm0[2],xmm22[3],xmm0[3] -; AVX512F-FAST-NEXT: vpermt2d %xmm0, %xmm1, %xmm12 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm17 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm19 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm19[2],xmm17[2],xmm19[3],xmm17[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm21[2],xmm13[2],xmm21[3],xmm13[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm13, %xmm5, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm22 +; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm24[2],xmm25[2],xmm24[3],xmm25[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm21 ; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm15[7] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm15 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm15 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 {%k1} +; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm25 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm27 +; AVX512F-FAST-NEXT: vpermi2d %xmm7, %xmm17, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm29[2],xmm28[2],xmm29[3],xmm28[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm22 -; AVX512F-FAST-NEXT: vpermi2d %xmm20, %xmm24, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm23[2],xmm10[2],xmm23[3],xmm10[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm13 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5],ymm7[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm1 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm24 = [3,7,3,7] -; AVX512F-FAST-NEXT: vpermt2d %xmm27, %xmm24, %xmm1 -; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm1 = xmm1[0,1],mem[2,3] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512F-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm1 = xmm0[0,1],mem[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm6 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm12 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4],ymm7[5],ymm12[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm6 -; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm24, %xmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm7 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm8 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm6 -; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm24, %xmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm0[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm17, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm22, %xmm17, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm4 +; AVX512F-FAST-NEXT: vpermt2d %xmm28, %xmm17, %xmm4 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm27[2],xmm25[2],xmm27[3],xmm25[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm7 -; AVX512F-FAST-NEXT: vpermt2d %xmm10, %xmm24, %xmm7 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm22[2],xmm20[2],xmm22[3],xmm20[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rsi) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rsi) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rdx) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rdx) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rcx) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rcx) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%r8) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, (%r8) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%r9) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, (%r9) +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rsi) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rdx) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rcx) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, 64(%r8) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, (%r8) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, 64(%r9) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, (%r9) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rax) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rax) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, 64(%rax) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-FAST-NEXT: addq $2392, %rsp # imm = 0x958 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-FAST-NEXT: addq $2312, %rsp # imm = 0x908 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: load_i16_stride8_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm7, %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm9, %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm2, %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm9, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm6, %zmm8, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm4, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm7, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm5, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm9, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm15, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm13, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm1, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm0, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm3, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm10, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm15, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm13, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm1, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm3, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm10, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm15, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm4, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm3, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm10, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm15, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm13, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm1, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm6, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm10, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm15, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm1, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm0, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm4, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm10, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm12, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm17, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm4, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm10, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm17, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm5, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm1, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm4, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm10, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm12, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm14, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm17, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm5, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm1, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm2, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm10, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm12, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm14, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm17, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm28, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm5, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm10, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm11, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512BW-ONLY-SLOW-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm11, %zmm18, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm28, %zmm19, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512BW-ONLY-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: load_i16_stride8_vf64: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm7, %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm9, %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm2, %zmm6, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm9, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm6, %zmm8, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm4, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm7, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm5, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm9, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm15, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm13, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm1, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm0, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm3, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm10, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm15, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm13, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm1, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm3, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm10, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm15, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm4, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm0, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm3, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm10, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm15, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm13, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm1, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm4, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm6, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm10, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm15, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm1, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm0, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm4, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm10, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm12, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm17, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm4, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm10, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm28, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm17, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm5, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm1, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm4, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm10, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm12, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm14, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm17, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm5, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm1, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm2, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm10, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm12, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm14, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm17, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm5, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm10, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm11, %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512BW-ONLY-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm11, %zmm18, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm18, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, (%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, (%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm19, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm28, %zmm19, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512BW-ONLY-FAST-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: load_i16_stride8_vf64: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: movb $-64, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm7, %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm9, %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm2, %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm9, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm6, %zmm8, %zmm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm4, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm7, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm5, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm9, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm15, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm13, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm0, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm3, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm10, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm15, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm13, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm1, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm3, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm10, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm15, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm4, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm3, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm10, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm15, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm13, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm1, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm4, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm6, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm10, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm15, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm1, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm0, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm4, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm10, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm12, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm17, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm4, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm10, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm17, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm5, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm1, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm4, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm10, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm12, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm14, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm17, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm5, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm1, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm2, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm10, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm12, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm14, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm17, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm28, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm5, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm10, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm10 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm13, %zmm3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm11, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512DQBW-SLOW-NEXT: # ymm18 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm11, %zmm18, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm18, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, (%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, (%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm6, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQBW-SLOW-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm28, %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQBW-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512DQBW-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: load_i16_stride8_vf64: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: movb $-64, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm7, %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm9, %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm2, %zmm6, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm9, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm6, %zmm8, %zmm1 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm4, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm7, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm5, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm9, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm15, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm13, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm1, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm0, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm3, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm10, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm15, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm13, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm1, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm3, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm10, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm15, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm4, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm0, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm3, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm10, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm15, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm13, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm1, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm4, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm6, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm10, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm15, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm1, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm0, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm4, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm10, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm12, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm17, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm4, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm10, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm28, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm17, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm5, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm1, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm4, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm10, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm12, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm14, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm17, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm5, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm1, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm2, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm10, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm12, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm14, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm17, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm5, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm10, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512DQBW-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm11, %zmm5, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512DQBW-FAST-NEXT: # ymm18 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm11, %zmm18, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm18, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, (%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, (%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQBW-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQBW-FAST-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm28, %zmm19, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQBW-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512DQBW-FAST-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <512 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll index b420710a4bbfd..b85824d860c84 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll @@ -276,33 +276,33 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps 48(%rdi), %xmm8 ; SSE-NEXT: movaps 208(%rdi), %xmm9 ; SSE-NEXT: movaps 192(%rdi), %xmm3 -; SSE-NEXT: movaps 80(%rdi), %xmm11 +; SSE-NEXT: movaps 80(%rdi), %xmm13 ; SSE-NEXT: movaps 64(%rdi), %xmm2 -; SSE-NEXT: movaps 240(%rdi), %xmm10 +; SSE-NEXT: movaps 240(%rdi), %xmm11 ; SSE-NEXT: movaps 224(%rdi), %xmm5 -; SSE-NEXT: movaps 112(%rdi), %xmm12 +; SSE-NEXT: movaps 112(%rdi), %xmm14 ; SSE-NEXT: movaps 96(%rdi), %xmm4 -; SSE-NEXT: movaps 144(%rdi), %xmm13 +; SSE-NEXT: movaps 144(%rdi), %xmm12 ; SSE-NEXT: movaps 128(%rdi), %xmm6 -; SSE-NEXT: movaps 176(%rdi), %xmm14 +; SSE-NEXT: movaps 176(%rdi), %xmm15 ; SSE-NEXT: movaps 160(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm4, %xmm15 +; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm14[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm14[1,3] +; SSE-NEXT: movaps %xmm2, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm13[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm13[1,3] +; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm15[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm15[1,3] +; SSE-NEXT: movaps %xmm6, %xmm15 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm12[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm12[1,3] -; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm12[1,3] +; SSE-NEXT: movaps %xmm5, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm11[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm11[1,3] -; SSE-NEXT: movaps %xmm7, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm14[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm14[1,3] -; SSE-NEXT: movaps %xmm6, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm13[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm13[1,3] -; SSE-NEXT: movaps %xmm5, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm10[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm10[1,3] -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm9[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm11[1,3] +; SSE-NEXT: movaps %xmm3, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm9[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm9[1,3] ; SSE-NEXT: movaps %xmm1, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm8[0,2] @@ -312,13 +312,13 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm8[1,3] -; SSE-NEXT: movaps %xmm10, 96(%rsi) -; SSE-NEXT: movaps %xmm12, 32(%rsi) -; SSE-NEXT: movaps %xmm13, 112(%rsi) -; SSE-NEXT: movaps %xmm15, 48(%rsi) -; SSE-NEXT: movaps %xmm14, 64(%rsi) +; SSE-NEXT: movaps %xmm11, 96(%rsi) +; SSE-NEXT: movaps %xmm14, 32(%rsi) +; SSE-NEXT: movaps %xmm12, 112(%rsi) +; SSE-NEXT: movaps %xmm10, 48(%rsi) +; SSE-NEXT: movaps %xmm15, 64(%rsi) ; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps %xmm11, 80(%rsi) +; SSE-NEXT: movaps %xmm13, 80(%rsi) ; SSE-NEXT: movaps %xmm9, 16(%rsi) ; SSE-NEXT: movaps %xmm3, 96(%rdx) ; SSE-NEXT: movaps %xmm5, 112(%rdx) @@ -432,61 +432,61 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-LABEL: load_i32_stride2_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps 208(%rdi), %xmm7 -; SSE-NEXT: movaps 192(%rdi), %xmm1 -; SSE-NEXT: movaps 80(%rdi), %xmm10 -; SSE-NEXT: movaps 64(%rdi), %xmm0 -; SSE-NEXT: movaps 240(%rdi), %xmm11 -; SSE-NEXT: movaps 224(%rdi), %xmm3 -; SSE-NEXT: movaps 112(%rdi), %xmm13 -; SSE-NEXT: movaps 96(%rdi), %xmm2 -; SSE-NEXT: movaps 272(%rdi), %xmm9 -; SSE-NEXT: movaps 144(%rdi), %xmm14 -; SSE-NEXT: movaps 128(%rdi), %xmm4 -; SSE-NEXT: movaps 304(%rdi), %xmm12 -; SSE-NEXT: movaps 288(%rdi), %xmm6 -; SSE-NEXT: movaps 176(%rdi), %xmm15 -; SSE-NEXT: movaps 160(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm13[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm13[1,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[0,2] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm10[1,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm15[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm15[1,3] +; SSE-NEXT: movaps 208(%rdi), %xmm11 +; SSE-NEXT: movaps 192(%rdi), %xmm6 +; SSE-NEXT: movaps 80(%rdi), %xmm1 +; SSE-NEXT: movaps 64(%rdi), %xmm5 +; SSE-NEXT: movaps 240(%rdi), %xmm14 +; SSE-NEXT: movaps 224(%rdi), %xmm8 +; SSE-NEXT: movaps 112(%rdi), %xmm3 +; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps 272(%rdi), %xmm12 +; SSE-NEXT: movaps 144(%rdi), %xmm2 +; SSE-NEXT: movaps 128(%rdi), %xmm9 +; SSE-NEXT: movaps 304(%rdi), %xmm0 +; SSE-NEXT: movaps 288(%rdi), %xmm13 +; SSE-NEXT: movaps 176(%rdi), %xmm4 +; SSE-NEXT: movaps 160(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm7, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm3[0,2] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm3[1,3] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm1[1,3] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm14[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm14[1,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm11[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm11[1,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm7[1,3] +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm12[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm12[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,3],xmm4[1,3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm2[1,3] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm14[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm14[1,3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm11[1,3] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,3],xmm0[1,3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm12[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm9[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm12[1,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm0 ; SSE-NEXT: movaps 352(%rdi), %xmm15 @@ -496,46 +496,46 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,3],xmm0[1,3] ; SSE-NEXT: movaps 336(%rdi), %xmm0 ; SSE-NEXT: movaps 320(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm13, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,3],xmm0[1,3] ; SSE-NEXT: movaps 432(%rdi), %xmm0 -; SSE-NEXT: movaps 416(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: movaps 416(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm14 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,3],xmm0[1,3] ; SSE-NEXT: movaps 400(%rdi), %xmm0 -; SSE-NEXT: movaps 384(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: movaps 384(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm10 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm0[1,3] -; SSE-NEXT: movaps 496(%rdi), %xmm1 -; SSE-NEXT: movaps 480(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm0[1,3] +; SSE-NEXT: movaps 496(%rdi), %xmm0 +; SSE-NEXT: movaps 480(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm0[1,3] +; SSE-NEXT: movaps 464(%rdi), %xmm1 +; SSE-NEXT: movaps 448(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm1[1,3] +; SSE-NEXT: movaps 32(%rdi), %xmm8 +; SSE-NEXT: movaps 48(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm1[1,3] -; SSE-NEXT: movaps 464(%rdi), %xmm3 -; SSE-NEXT: movaps 448(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] -; SSE-NEXT: movaps 32(%rdi), %xmm11 -; SSE-NEXT: movaps 48(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm11, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,3],xmm2[1,3] -; SSE-NEXT: movaps (%rdi), %xmm8 -; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm8, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm2[1,3] -; SSE-NEXT: movaps %xmm0, 224(%rsi) -; SSE-NEXT: movaps %xmm12, 160(%rsi) +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm1[1,3] +; SSE-NEXT: movaps (%rdi), %xmm4 +; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm0[1,3] +; SSE-NEXT: movaps %xmm2, 224(%rsi) +; SSE-NEXT: movaps %xmm11, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps %xmm5, 240(%rsi) +; SSE-NEXT: movaps %xmm6, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -547,17 +547,17 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 128(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps %xmm7, (%rsi) +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps %xmm14, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps %xmm1, 224(%rdx) -; SSE-NEXT: movaps %xmm4, 240(%rdx) -; SSE-NEXT: movaps %xmm6, 192(%rdx) -; SSE-NEXT: movaps %xmm9, 208(%rdx) +; SSE-NEXT: movaps %xmm5, 16(%rsi) +; SSE-NEXT: movaps %xmm3, 224(%rdx) +; SSE-NEXT: movaps %xmm7, 240(%rdx) +; SSE-NEXT: movaps %xmm9, 192(%rdx) +; SSE-NEXT: movaps %xmm12, 208(%rdx) ; SSE-NEXT: movaps %xmm13, 160(%rdx) ; SSE-NEXT: movaps %xmm15, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -576,141 +576,141 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps %xmm8, (%rdx) -; SSE-NEXT: movaps %xmm11, 16(%rdx) +; SSE-NEXT: movaps %xmm4, (%rdx) +; SSE-NEXT: movaps %xmm8, 16(%rdx) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride2_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,2],ymm8[0,2],ymm9[4,6],ymm8[4,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,2],ymm8[0,2],ymm10[4,6],ymm8[4,6] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm3, %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2],ymm10[0,2],ymm11[4,6],ymm10[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm7[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[0,2],ymm12[0,2],ymm7[4,6],ymm12[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm2, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,2],ymm7[0,2],ymm11[4,6],ymm7[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm3[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,2],ymm12[0,2],ymm13[4,6],ymm12[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,2],ymm14[0,2],ymm5[4,6],ymm14[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[1,3],ymm10[1,3],ymm11[5,7],ymm10[5,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm6[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3],ymm12[1,3],ymm7[5,7],ymm12[5,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm4[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,3],ymm14[1,3],ymm5[5,7],ymm14[5,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm2[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,3],ymm8[1,3],ymm9[5,7],ymm8[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,2],ymm14[0,2],ymm2[4,6],ymm14[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm14[1,3],ymm2[5,7],ymm14[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[0,2],ymm12[0,2],ymm4[4,6],ymm12[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,3],ymm12[1,3],ymm4[5,7],ymm12[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm6[0,2],ymm11[0,2],ymm6[4,6],ymm11[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3],ymm11[1,3],ymm6[5,7],ymm11[5,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm14[0,2],ymm15[4,6],ymm14[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,3],ymm7[1,3],ymm11[5,7],ymm7[5,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm9[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,3],ymm12[1,3],ymm13[5,7],ymm12[5,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,3],ymm14[1,3],ymm15[5,7],ymm14[5,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm4[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[1,3],ymm8[1,3],ymm10[5,7],ymm8[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[0,2],ymm15[0,2],ymm4[4,6],ymm15[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,3],ymm15[1,3],ymm4[5,7],ymm15[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[0,2],ymm13[0,2],ymm6[4,6],ymm13[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3],ymm13[1,3],ymm6[5,7],ymm13[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2],ymm11[0,2],ymm9[4,6],ymm11[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3],ymm11[1,3],ymm9[5,7],ymm11[5,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm11[0,2],ymm1[4,6],ymm11[4,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm11[1,3],ymm1[5,7],ymm11[5,7] -; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 64(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride2_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[0,2],ymm14[0,2],ymm15[4,6],ymm14[4,6] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,2],ymm2[0,2],ymm13[4,6],ymm2[4,6] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,3],ymm14[1,3],ymm15[5,7],ymm14[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,2],ymm12[0,2],ymm13[4,6],ymm12[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,3],ymm12[1,3],ymm13[5,7],ymm12[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm11[0,2],ymm10[0,2],ymm11[4,6],ymm10[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[1,3],ymm10[1,3],ymm11[5,7],ymm10[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm7[0,2],ymm4[0,2],ymm7[4,6],ymm4[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,3],ymm4[1,3],ymm7[5,7],ymm4[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,2],ymm5[0,2],ymm8[4,6],ymm5[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,3],ymm5[1,3],ymm8[5,7],ymm5[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,2],ymm6[0,2],ymm9[4,6],ymm6[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,3],ymm6[1,3],ymm9[5,7],ymm6[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm2[1,3],ymm1[5,7],ymm2[5,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,3],ymm2[1,3],ymm3[5,7],ymm2[5,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm11[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm7[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm8[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,3],ymm2[1,3],ymm13[5,7],ymm2[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,2],ymm5[0,2],ymm15[4,6],ymm5[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm15[1,3],ymm5[1,3],ymm15[5,7],ymm5[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[0,2],ymm6[0,2],ymm14[4,6],ymm6[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[1,3],ymm6[1,3],ymm14[5,7],ymm6[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2],ymm10[0,2],ymm12[4,6],ymm10[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,3],ymm10[1,3],ymm12[5,7],ymm10[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm11[0,2],ymm8[0,2],ymm11[4,6],ymm8[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,3],ymm8[1,3],ymm11[5,7],ymm8[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,2],ymm7[0,2],ymm9[4,6],ymm7[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,3],ymm7[1,3],ymm9[5,7],ymm7[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm3[0,2],ymm4[0,2],ymm3[4,6],ymm4[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm4[1,3],ymm3[5,7],ymm4[5,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm4[0,2],ymm1[4,6],ymm4[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm4[1,3],ymm1[5,7],ymm4[5,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm14[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm11[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll index 2829c15ed8256..f6a81c61b87a7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -213,40 +213,40 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps 80(%rdi), %xmm0 -; SSE-NEXT: movaps 64(%rdi), %xmm4 -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movaps 16(%rdi), %xmm6 -; SSE-NEXT: movaps 32(%rdi), %xmm3 +; SSE-NEXT: movaps 80(%rdi), %xmm1 +; SSE-NEXT: movaps 64(%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm7 +; SSE-NEXT: movaps 32(%rdi), %xmm4 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm6[0,2] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm7[0,2] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: movaps %xmm5, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,3,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm4[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm5[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,3] -; SSE-NEXT: movaps %xmm6, 16(%rsi) -; SSE-NEXT: movaps %xmm5, (%rsi) +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm7, 16(%rsi) +; SSE-NEXT: movaps %xmm3, (%rsi) ; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps %xmm11, 16(%rcx) -; SSE-NEXT: movaps %xmm8, (%rcx) +; SSE-NEXT: movaps %xmm6, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf8: @@ -407,72 +407,73 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 160(%rdi), %xmm9 ; SSE-NEXT: movaps (%rdi), %xmm7 ; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdi), %xmm15 ; SSE-NEXT: movaps 80(%rdi), %xmm14 -; SSE-NEXT: movaps 64(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps 64(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[1,0] ; SSE-NEXT: movaps %xmm15, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] ; SSE-NEXT: movaps %xmm7, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm3 ; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: movaps %xmm11, %xmm2 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm1, %xmm12 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm15, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm3[0,0] -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm2[0,0] +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm14[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: movaps %xmm4, %xmm6 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm9[0,0] ; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm10[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm13[0,0] +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm8[0,0] ; SSE-NEXT: movaps %xmm8, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm12[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm10[0,3] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,3] @@ -483,13 +484,13 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm3, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps %xmm6, 48(%rdx) ; SSE-NEXT: movaps %xmm7, (%rdx) ; SSE-NEXT: movaps %xmm11, 16(%rdx) ; SSE-NEXT: movaps %xmm4, 32(%rcx) ; SSE-NEXT: movaps %xmm8, 48(%rcx) -; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps %xmm2, 16(%rcx) ; SSE-NEXT: retq ; @@ -731,238 +732,240 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $360, %rsp # imm = 0x168 -; SSE-NEXT: movaps 192(%rdi), %xmm3 -; SSE-NEXT: movaps 224(%rdi), %xmm2 -; SSE-NEXT: movaps 208(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: subq $392, %rsp # imm = 0x188 +; SSE-NEXT: movaps 192(%rdi), %xmm4 +; SSE-NEXT: movaps 224(%rdi), %xmm3 +; SSE-NEXT: movaps 208(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm7 -; SSE-NEXT: movaps 272(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 272(%rdi), %xmm6 ; SSE-NEXT: movaps 256(%rdi), %xmm9 -; SSE-NEXT: movaps (%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm8 +; SSE-NEXT: movaps (%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdi), %xmm2 ; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps 64(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps 64(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps %xmm1, %xmm15 +; SSE-NEXT: movaps %xmm1, %xmm12 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0] +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm7, %xmm11 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[1,0] -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[1,0] +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm3, %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] +; SSE-NEXT: movaps %xmm3, %xmm13 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdi), %xmm10 -; SSE-NEXT: movaps 160(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm11 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] +; SSE-NEXT: movaps 144(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 352(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 336(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 336(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 96(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm2 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 304(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] -; SSE-NEXT: movaps 288(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm15[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm6[0,2] +; SSE-NEXT: movaps 96(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 320(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 304(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] +; SSE-NEXT: movaps 288(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm1 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm14 ; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm11, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm3, %xmm9 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm0[0,0] ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm11 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm12[0,0] -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm10[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm7, %xmm14 +; SSE-NEXT: movaps %xmm4, %xmm7 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm0[0,0] ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm13, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm3[0,0] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps (%rsp), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[0,2] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm4[0,0] -; SSE-NEXT: movaps %xmm4, %xmm7 -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm13[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: movaps %xmm4, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm15[0,0] +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] +; SSE-NEXT: movaps %xmm0, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm13[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, (%rsp), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[0,1],mem[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm12[0,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[0,1],mem[0,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 16(%rsi) -; SSE-NEXT: movaps %xmm2, 96(%rdx) -; SSE-NEXT: movaps %xmm10, 32(%rdx) -; SSE-NEXT: movaps %xmm14, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps %xmm3, 96(%rdx) +; SSE-NEXT: movaps %xmm11, 32(%rdx) +; SSE-NEXT: movaps %xmm7, 112(%rdx) ; SSE-NEXT: movaps %xmm9, 48(%rdx) -; SSE-NEXT: movaps %xmm11, 64(%rdx) +; SSE-NEXT: movaps %xmm10, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 80(%rdx) +; SSE-NEXT: movaps %xmm14, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rdx) ; SSE-NEXT: movaps %xmm0, 96(%rcx) ; SSE-NEXT: movaps %xmm1, 112(%rcx) -; SSE-NEXT: movaps %xmm13, 64(%rcx) -; SSE-NEXT: movaps %xmm7, 80(%rcx) -; SSE-NEXT: movaps %xmm3, 32(%rcx) -; SSE-NEXT: movaps %xmm5, 48(%rcx) -; SSE-NEXT: movaps %xmm6, (%rcx) -; SSE-NEXT: movaps %xmm4, 16(%rcx) -; SSE-NEXT: addq $360, %rsp # imm = 0x168 +; SSE-NEXT: movaps %xmm15, 64(%rcx) +; SSE-NEXT: movaps %xmm13, 80(%rcx) +; SSE-NEXT: movaps %xmm4, 32(%rcx) +; SSE-NEXT: movaps %xmm6, 48(%rcx) +; SSE-NEXT: movaps %xmm5, (%rcx) +; SSE-NEXT: movaps %xmm8, 16(%rcx) +; SSE-NEXT: addq $392, %rsp # imm = 0x188 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $392, %rsp # imm = 0x188 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm10 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -970,233 +973,236 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm8[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm10[2,0],ymm8[5,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm10[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6],ymm5[7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm5[1,3],ymm1[6,5],ymm5[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm14[2,0],ymm6[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm14[2,0],ymm4[5,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm7[1],ymm3[2,3],ymm7[4],ymm3[5,6],ymm7[7] ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm4[1,3],ymm1[6,5],ymm4[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm7[1,3],ymm1[6,5],ymm7[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm15[1,3],ymm0[6,5],ymm15[5,7] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6],ymm15[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm10[3,0],ymm8[6,4],ymm10[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,0],ymm0[2,0],ymm10[4,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[1,0],ymm0[2,0],ymm7[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm11, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm11[3,0],ymm10[6,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[0,0],ymm8[2,0],ymm11[4,4],ymm8[6,4] ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,2],ymm13[0,3],ymm8[5,6],ymm13[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,0],ymm14[3,0],ymm6[6,4],ymm14[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,0],ymm0[2,0],ymm14[4,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm13[0,3],ymm6[5,6],ymm13[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm4[2,0],ymm14[3,0],ymm4[6,4],ymm14[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0],ymm6[2,0],ymm14[4,4],ymm6[6,4] ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,2],ymm8[0,3],ymm10[5,6],ymm8[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm0[3,0],ymm7[6,4],ymm0[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[0,0],ymm6[2,0],ymm0[4,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,2],ymm11[0,3],ymm4[5,6],ymm11[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,0],ymm12[3,0],ymm1[6,4],ymm12[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,0],ymm4[2,0],ymm12[4,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm10[0,3],ymm6[5,6],ymm10[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm12[3,0],ymm2[6,4],ymm12[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,0],ymm0[2,0],ymm12[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm9[0,1],mem[2],ymm9[3,4],mem[5],ymm9[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,0],ymm7[2,0],ymm13[5,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0],ymm9[0,3],ymm7[6,4],ymm9[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm3[0,1],mem[0,3],ymm3[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm9[0,1],mem[2],ymm9[3,4],mem[5],ymm9[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm13[1,0],ymm6[2,0],ymm13[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm9[0,3],ymm6[6,4],ymm9[4,7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm9[0,1],mem[0,3],ymm9[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm8[2,0],ymm5[0,3],ymm8[6,4],ymm5[4,7] -; AVX1-ONLY-NEXT: vshufps $196, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm14[0,1],mem[0,3],ymm14[4,5],mem[4,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm10[1,0],ymm3[2,0],ymm10[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,0],ymm3[2,0],ymm11[5,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm15[0,3],ymm3[6,4],ymm15[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,3],ymm1[4,5],ymm4[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm0[0,3],ymm2[6,4],ymm0[4,7] -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm12[0,1],mem[0,3],ymm12[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm7[0,3],ymm0[4,5],ymm7[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm1[2,0],ymm10[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[0,3],ymm1[6,4],ymm2[4,7] +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm12[0,1],mem[0,3],ymm12[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) ; AVX1-ONLY-NEXT: addq $392, %rsp # imm = 0x188 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride3_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $104, %rsp -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: subq $136, %rsp +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm8 = [2,5,2,5,2,5,2,5] -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm11 = <0,3,6,1,4,7,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2,5,2,5,2,5,2,5] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm14 = <0,3,6,1,4,7,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm1, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovaps %ymm12, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm14, %ymm8 +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm1[1],ymm14[2,3],ymm1[4],ymm14[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm11, %ymm11 -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = <1,4,7,2,5,u,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1],ymm3[2],ymm13[3,4],ymm3[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm1, %ymm11 -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm6, %ymm8 +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm2, %ymm9 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm10, %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm10, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm10, %ymm4 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-SLOW-NEXT: addq $104, %rsp +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-SLOW-NEXT: addq $136, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1205,81 +1211,81 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: subq $104, %rsp ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm13 ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm12 ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-NEXT: vpermps %ymm12, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm13 = <0,3,6,1,4,7,u,u> -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm13, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm6 -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm13, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6],ymm3[7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm13, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm1[1],ymm14[2,3],ymm1[4],ymm14[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm9 +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <1,4,7,2,5,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm8 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm2, %ymm13 -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm6[2],ymm14[3,4],ymm6[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = [0,1,0,3,0,1,4,7] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm9, %ymm11 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm12 = <2,5,0,3,6,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm9, %ymm5 -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm12, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm9 +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm10 = [0,1,0,3,0,1,4,7] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm10, %ymm12 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm13 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm10, %ymm6 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm13, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm13, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm14[2],ymm6[3,4],ymm14[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm13, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rsi) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload @@ -1289,119 +1295,123 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rsi) ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-FAST-NEXT: vmovaps %ymm10, (%rdx) -; AVX2-FAST-NEXT: vmovaps %ymm8, 96(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm8, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: addq $104, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride3_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $104, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: subq $136, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm8 = [2,5,2,5,2,5,2,5] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm11 = <0,3,6,1,4,7,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2,5,2,5,2,5,2,5] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm14 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm14, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm1[1],ymm14[2,3],ymm1[4],ymm14[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1],ymm3[2],ymm13[3,4],ymm3[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm1, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm10, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm10, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: addq $104, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: addq $136, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1454,47 +1464,47 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1064, %rsp # imm = 0x428 +; SSE-NEXT: subq $1112, %rsp # imm = 0x458 ; SSE-NEXT: movaps 624(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 656(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 640(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 432(%rdi), %xmm4 +; SSE-NEXT: movaps 656(%rdi), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 464(%rdi), %xmm6 +; SSE-NEXT: movaps 640(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 432(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 448(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 464(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 448(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 272(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm8 -; SSE-NEXT: movaps 80(%rdi), %xmm11 -; SSE-NEXT: movaps 64(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[1,0] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 272(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 256(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdi), %xmm9 +; SSE-NEXT: movaps 80(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,0] +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] ; SSE-NEXT: movaps %xmm7, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0] -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,0] +; SSE-NEXT: movaps %xmm6, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[1,0] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1503,227 +1513,231 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 32(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps (%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 224(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 208(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 192(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 416(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 400(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 384(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 608(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 384(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 608(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 592(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] -; SSE-NEXT: movaps 576(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdi), %xmm6 -; SSE-NEXT: movaps 160(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] +; SSE-NEXT: movaps 576(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rdi), %xmm10 +; SSE-NEXT: movaps 160(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 352(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 336(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 560(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 336(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 560(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 544(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] -; SSE-NEXT: movaps 528(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 752(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] +; SSE-NEXT: movaps 528(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 752(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 736(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] -; SSE-NEXT: movaps 720(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm4 -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[1,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] +; SSE-NEXT: movaps 720(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdi), %xmm6 +; SSE-NEXT: movaps 112(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[1,0] +; SSE-NEXT: movaps 96(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 320(%rdi), %xmm13 -; SSE-NEXT: movaps 304(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm13[1,0] +; SSE-NEXT: movaps 304(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[1,0] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 288(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 512(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 496(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[1,0] -; SSE-NEXT: movaps 480(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 704(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 688(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm9[1,0] -; SSE-NEXT: movaps 672(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm9 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm11[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm10[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 496(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0] +; SSE-NEXT: movaps 480(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm7[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm6[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[0,0] -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 704(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 688(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[1,0] +; SSE-NEXT: movaps 672(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm12[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm12[0,2] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[0,0] -; SSE-NEXT: movaps %xmm11, %xmm2 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm8[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm9[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm2[0,2] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm12 +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm5 -; SSE-NEXT: movaps %xmm12, %xmm15 -; SSE-NEXT: movaps %xmm14, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm14[0,0] -; SSE-NEXT: movaps %xmm14, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm15[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm15[0,2] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm13[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm11[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0] +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm14[0,0] +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm0, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm2[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm1[0,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm1[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] -; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] +; SSE-NEXT: movaps %xmm2, %xmm11 ; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -1732,13 +1746,12 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm6[0,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload @@ -1746,11 +1759,18 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[0,1],mem[0,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm9[0,3] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[0,1],mem[0,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload @@ -1758,11 +1778,6 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = xmm10[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[0,1],mem[0,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload @@ -1770,13 +1785,12 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[2,3,2,3] @@ -1788,16 +1802,15 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,3] -; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] @@ -1815,8 +1828,7 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -1856,8 +1868,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm4, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%rsi) -; SSE-NEXT: movaps %xmm12, 224(%rdx) -; SSE-NEXT: movaps %xmm14, 240(%rdx) +; SSE-NEXT: movaps %xmm14, 224(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 240(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -1895,22 +1908,21 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm7, 144(%rcx) ; SSE-NEXT: movaps %xmm8, 128(%rcx) ; SSE-NEXT: movaps %xmm9, 112(%rcx) -; SSE-NEXT: movaps %xmm15, 96(%rcx) -; SSE-NEXT: movaps %xmm10, 80(%rcx) -; SSE-NEXT: movaps %xmm11, 64(%rcx) +; SSE-NEXT: movaps %xmm10, 96(%rcx) +; SSE-NEXT: movaps %xmm11, 80(%rcx) +; SSE-NEXT: movaps %xmm12, 64(%rcx) ; SSE-NEXT: movaps %xmm13, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps %xmm15, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: addq $1064, %rsp # imm = 0x428 +; SSE-NEXT: addq $1112, %rsp # imm = 0x458 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1416, %rsp # imm = 0x588 +; AVX1-ONLY-NEXT: subq $1384, %rsp # imm = 0x568 ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1942,8 +1954,8 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm6[1,3],ymm1[6,5],ymm6[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm15[2,0],ymm5[5,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm14[2,0],ymm5[5,4],ymm14[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1952,8 +1964,8 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm3[1,3],ymm1[6,5],ymm3[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm5[2,0],ymm2[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm3[2,0],ymm2[5,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1967,87 +1979,87 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm14[2,0],ymm1[5,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm15[2,0],ymm1[5,4],ymm15[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm7[1,3],ymm0[6,5],ymm7[5,7] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6],ymm8[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3],ymm7[4],ymm1[5,6],ymm7[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm13[2,0],ymm1[5,4],ymm13[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm9[1,3],ymm0[6,5],ymm9[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm6[1,3],ymm0[6,5],ymm6[5,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5,6],ymm9[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm1[2,0],ymm5[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6],ymm10[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6],ymm12[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm7[2,0],ymm2[5,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm10 ; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm11[1,3],ymm0[6,5],ymm11[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3],ymm11[4],ymm1[5,6],ymm11[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6],ymm10[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm11[2,0],ymm2[5,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm0[0,3],ymm1[5,6],ymm0[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm15[3,0],ymm0[6,4],ymm15[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[0,0],ymm0[2,0],ymm15[4,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[3,0],ymm0[6,4],ymm14[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,0],ymm0[2,0],ymm14[4,4],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -2057,71 +2069,68 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[3,0],ymm0[6,4],ymm5[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,0],ymm0[2,0],ymm5[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[3,0],ymm0[6,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[2,0],ymm3[4,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm3[0,3],ymm1[5,6],ymm3[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[3,0],ymm0[6,4],ymm14[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,0],ymm0[2,0],ymm14[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm15[3,0],ymm0[6,4],ymm15[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[0,0],ymm0[2,0],ymm15[4,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm3[0,3],ymm1[5,6],ymm3[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm13[3,0],ymm0[6,4],ymm13[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0],ymm0[2,0],ymm13[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,2],ymm3[0,3],ymm2[5,6],ymm3[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,2],ymm4[0,3],ymm2[5,6],ymm4[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,0],ymm12[3,0],ymm4[6,4],ymm12[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0],ymm1[2,0],ymm12[4,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,0],ymm2[3,0],ymm5[6,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0],ymm1[2,0],ymm2[4,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,2],ymm4[0,3],ymm5[5,6],ymm4[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,2],ymm5[0,3],ymm3[5,6],ymm5[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm7[3,0],ymm6[6,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0],ymm2[2,0],ymm7[4,4],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[2,0],ymm9[3,0],ymm8[6,4],ymm9[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,0],ymm2[2,0],ymm9[4,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1],ymm10[2],ymm2[3,4],ymm10[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,2],ymm6[0,3],ymm15[5,6],ymm6[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,2],ymm8[0,3],ymm15[5,6],ymm8[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm12[2,0],ymm7[3,0],ymm12[6,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm7[0,0],ymm5[2,0],ymm7[4,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1],ymm11[2],ymm5[3,4],ymm11[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm7[0,3],ymm14[5,6],ymm7[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[2,0],ymm11[3,0],ymm13[6,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[0,0],ymm3[2,0],ymm11[4,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1],ymm10[2],ymm3[3,4],ymm10[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm9[0,3],ymm14[5,6],ymm9[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2134,831 +2143,831 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm15[0,1],mem[0,3],ymm15[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,0],ymm14[2,0],ymm3[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[0,3],ymm0[6,4],ymm8[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1],mem[0,3],ymm3[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,0],ymm0[2,0],ymm13[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[0,3],ymm0[6,4],ymm8[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1],mem[0,3],ymm8[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm14[2,0],ymm4[5,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[0,3],ymm0[6,4],ymm7[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm0[2,0],ymm4[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[0,3],ymm0[6,4],ymm9[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[0,3],ymm0[6,4],ymm14[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,0],ymm0[2,0],ymm5[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[0,3],ymm0[6,4],ymm6[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1],mem[0,3],ymm1[4,5],mem[4,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,0],ymm1[2,0],ymm9[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm4[0,3],ymm1[6,4],ymm4[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm6[0,3],ymm1[6,4],ymm6[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm2[2],ymm10[3,4],ymm2[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm10[0,3],ymm2[6,4],ymm10[4,7] +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm2[2],ymm12[3,4],ymm2[5],ymm12[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm6[2,0],ymm8[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm12[0,3],ymm2[6,4],ymm12[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm4[2,0],ymm9[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm6[0,3],ymm4[6,4],ymm6[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1],mem[0,3],ymm6[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm5[2],ymm11[3,4],ymm5[5],ymm11[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,0],ymm6[2,0],ymm7[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm11[0,3],ymm5[6,4],ymm11[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1],ymm12[0,3],ymm6[4,5],ymm12[4,7] +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[1,0],ymm5[2,0],ymm4[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm6[0,3],ymm5[6,4],ymm6[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rcx) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,0],ymm6[2,0],ymm9[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm10[0,3],ymm3[6,4],ymm10[4,7] +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm11[0,1],mem[0,3],ymm11[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 32(%rcx) -; AVX1-ONLY-NEXT: addq $1416, %rsp # imm = 0x588 +; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%rcx) +; AVX1-ONLY-NEXT: addq $1384, %rsp # imm = 0x568 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride3_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm10 +; AVX2-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm14 ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm13 = <0,3,6,1,4,7,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm13, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm8 = <0,3,6,1,4,7,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm13, %ymm2 +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps %ymm4, %ymm9 -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm13, %ymm2 +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm13, %ymm1 -; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm13, %ymm7 -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovaps %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps %ymm3, %ymm6 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm13, %ymm14 -; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] +; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm13, %ymm15 -; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm7[2],ymm12[3,4],ymm7[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm15 = <1,4,7,2,5,u,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm13[2],mem[3,4],ymm13[5],mem[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vpermps (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vpermps (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps %ymm6, %ymm7 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm12 = <2,5,0,3,6,u,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm13[0,1],mem[2],ymm13[3,4],mem[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm12, %ymm1 -; AVX2-SLOW-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm14 = <2,5,0,3,6,u,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm12, %ymm2 -; AVX2-SLOW-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm12, %ymm3 -; AVX2-SLOW-NEXT: vpermilps $196, (%rsp), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm12, %ymm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm14, %ymm4 +; AVX2-SLOW-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm12, %ymm5 -; AVX2-SLOW-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm14, %ymm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm12, %ymm6 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm15[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm14, %ymm6 +; AVX2-SLOW-NEXT: vpermilps $196, (%rsp), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 128(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 224(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 160(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 128(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 224(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 160(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rdx) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm5, 224(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 128(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm14, %ymm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 192(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 224(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 160(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rsi) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 160(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX2-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX2-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride3_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1000, %rsp # imm = 0x3E8 +; AVX2-FAST-NEXT: subq $1032, %rsp # imm = 0x408 ; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm14 ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm10[1],ymm14[2,3],ymm10[4],ymm14[5,6],ymm10[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6],ymm11[7] ; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm10 = <0,3,6,1,4,7,u,u> -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovaps %ymm4, %ymm9 -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm6 -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm8 -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm10, %ymm11 -; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm12 +; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm12[0],ymm1[1],ymm12[2,3],ymm1[4],ymm12[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm10, %ymm15 -; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm15 = <1,4,7,2,5,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm6[2],mem[3,4],ymm6[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $219, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps %ymm8, %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm15[2],mem[3,4],ymm15[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm13[2],ymm9[3,4],ymm13[5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm9, %ymm13 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm15 = [0,1,0,3,0,1,4,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm15, %ymm14 +; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = [0,1,0,3,0,1,4,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm14, %ymm0 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm15, %ymm4 -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm15, %ymm6 -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm15, %ymm7 -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] +; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm1, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1],ymm13[2],mem[3,4],ymm13[5],mem[6,7] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 192(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 128(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 224(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 160(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 192(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 128(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 224(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 160(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 224(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 160(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 160(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) ; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm7, 224(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm3, 160(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm14, 32(%rcx) -; AVX2-FAST-NEXT: addq $1000, %rsp # imm = 0x3E8 +; AVX2-FAST-NEXT: vmovaps %ymm11, 96(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, (%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX2-FAST-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride3_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm13 = <0,3,6,1,4,7,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm8 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm13, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm13, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm13, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm7[2],ymm12[3,4],ymm7[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm15 = <1,4,7,2,5,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm13[2],mem[3,4],ymm13[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm12 = <2,5,0,3,6,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,3,4,5,4,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm13[0,1],mem[2],ymm13[3,4],mem[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm14 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm12, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermilps $196, (%rsp), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm12, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm14, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm12, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm14, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm15[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm14, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermilps $196, (%rsp), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 128(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 224(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 160(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 128(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 224(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 160(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm14, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 192(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 224(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 160(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 160(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll index 980312630759a..eba201399b0b9 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -420,109 +420,106 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i32_stride4_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $40, %rsp -; SSE-NEXT: movaps 208(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm1 -; SSE-NEXT: movaps 144(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm14 -; SSE-NEXT: movaps 176(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm2 +; SSE-NEXT: subq $24, %rsp +; SSE-NEXT: movaps 208(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm10 -; SSE-NEXT: movaps 112(%rdi), %xmm4 +; SSE-NEXT: movaps 240(%rdi), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: movaps 224(%rdi), %xmm12 +; SSE-NEXT: movaps 144(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdi), %xmm3 +; SSE-NEXT: movaps 176(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm9 +; SSE-NEXT: movaps 80(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm11 +; SSE-NEXT: movaps 112(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movaps %xmm11, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps %xmm10, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; SSE-NEXT: movaps %xmm14, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movaps 192(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps 192(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps 48(%rdi), %xmm15 +; SSE-NEXT: movaps 48(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSE-NEXT: movaps (%rdi), %xmm11 -; SSE-NEXT: movaps 16(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm11, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; SSE-NEXT: movaps %xmm13, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm0, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] +; SSE-NEXT: movaps %xmm14, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm8[1] -; SSE-NEXT: movaps %xmm14, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps %xmm12, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm7[1] +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1] +; SSE-NEXT: movaps %xmm10, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps %xmm7, (%rsi) -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movaps %xmm4, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movaps %xmm4, 48(%rdx) -; SSE-NEXT: movaps %xmm13, (%rdx) -; SSE-NEXT: movaps %xmm6, 32(%rdx) -; SSE-NEXT: movaps %xmm5, 16(%rdx) -; SSE-NEXT: movaps %xmm9, 48(%rcx) -; SSE-NEXT: movaps %xmm8, 32(%rcx) +; SSE-NEXT: movaps %xmm15, 48(%rdx) +; SSE-NEXT: movaps %xmm14, (%rdx) +; SSE-NEXT: movaps %xmm5, 32(%rdx) +; SSE-NEXT: movaps %xmm13, 16(%rdx) +; SSE-NEXT: movaps %xmm7, 48(%rcx) +; SSE-NEXT: movaps %xmm6, 32(%rcx) ; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movaps %xmm12, 48(%r8) -; SSE-NEXT: movaps %xmm14, 32(%r8) -; SSE-NEXT: movaps %xmm10, 16(%r8) -; SSE-NEXT: movaps %xmm11, (%r8) -; SSE-NEXT: addq $40, %rsp +; SSE-NEXT: movaps %xmm8, (%rcx) +; SSE-NEXT: movaps %xmm10, 48(%r8) +; SSE-NEXT: movaps %xmm3, 32(%r8) +; SSE-NEXT: movaps %xmm11, 16(%r8) +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride4_vf16: @@ -702,20 +699,20 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] ; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -743,7 +740,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm9, %ymm5 ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm9, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [3,7,3,7] ; AVX2-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload @@ -753,7 +750,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm9, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm6, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -822,47 +819,47 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i32_stride4_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $440, %rsp # imm = 0x1B8 +; SSE-NEXT: subq $456, %rsp # imm = 0x1C8 ; SSE-NEXT: movaps 272(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 304(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 336(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm10 +; SSE-NEXT: movaps 288(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 336(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdi), %xmm11 +; SSE-NEXT: movaps 320(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 368(%rdi), %xmm11 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 352(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdi), %xmm1 -; SSE-NEXT: movaps 112(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm4 ; SSE-NEXT: movaps 96(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movaps 256(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -874,8 +871,8 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movaps 224(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 208(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -894,189 +891,185 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 464(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 448(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps 448(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 144(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 176(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 144(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] -; SSE-NEXT: movaps 432(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 416(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 400(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] -; SSE-NEXT: movaps 32(%rdi), %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: movaps 432(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 416(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 400(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 384(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps 32(%rdi), %xmm11 ; SSE-NEXT: movaps 48(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: movaps (%rdi), %xmm6 +; SSE-NEXT: movaps 16(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps (%rdi), %xmm10 -; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm3[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: unpckhps (%rsp), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] +; SSE-NEXT: movaps %xmm15, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm4[1] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1] -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: movaps %xmm10, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 16(%rsi) +; SSE-NEXT: movaps %xmm7, 96(%rdx) +; SSE-NEXT: movaps %xmm14, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 64(%rdx) +; SSE-NEXT: movaps %xmm5, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps %xmm3, 96(%rcx) +; SSE-NEXT: movaps %xmm9, 32(%rcx) +; SSE-NEXT: movaps %xmm10, 112(%rcx) +; SSE-NEXT: movaps %xmm4, 48(%rcx) +; SSE-NEXT: movaps %xmm13, 64(%rcx) +; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps %xmm0, 80(%rcx) +; SSE-NEXT: movaps %xmm8, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps %xmm0, 112(%r8) +; SSE-NEXT: movaps %xmm12, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%rsi) +; SSE-NEXT: movaps %xmm0, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps %xmm0, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps %xmm7, 96(%rdx) -; SSE-NEXT: movaps %xmm11, 32(%rdx) -; SSE-NEXT: movaps %xmm15, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movaps %xmm6, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps %xmm4, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movaps %xmm12, 64(%rcx) -; SSE-NEXT: movaps %xmm14, (%rcx) -; SSE-NEXT: movaps %xmm1, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm8, 112(%r8) -; SSE-NEXT: movaps %xmm9, 96(%r8) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movaps %xmm13, 32(%r8) +; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps %xmm15, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm10, (%r8) -; SSE-NEXT: addq $440, %rsp # imm = 0x1B8 +; SSE-NEXT: movaps %xmm6, (%r8) +; SSE-NEXT: addq $456, %rsp # imm = 0x1C8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride4_vf32: @@ -1087,25 +1080,25 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm15[0],ymm5[2],ymm15[2] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm10 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm11 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm7, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1131,31 +1124,32 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm12 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm11 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1163,91 +1157,92 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1],ymm8[2,0],ymm10[4,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm5[2,0],ymm7[4,5],ymm5[6,4] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm6[0],ymm13[1],ymm6[1],ymm13[4],ymm6[4],ymm13[5],ymm6[5] -; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm11[1,0],ymm5[5,4],ymm11[5,4] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[4],ymm10[4],ymm15[5],ymm10[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm10[1,0],ymm6[5,4],ymm10[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm9[1],xmm11[1],zero,zero +; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm9[1],zero,zero ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm9[0],mem[0],xmm9[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm14[1,0],ymm12[5,4],ymm14[5,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm13[0],ymm8[1],ymm13[1],ymm8[4],ymm13[4],ymm8[5],ymm13[5] +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm11[1,0],mem[1,0],ymm11[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[1],xmm4[1],zero,zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm12[1],xmm4[1],zero,zero ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm3[1,0],ymm1[5,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[2,0],ymm15[2,3],ymm1[6,4],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[2,0],ymm14[2,3],ymm1[6,4],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm4[0],mem[0],xmm4[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm4[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[4],ymm7[4],ymm12[5],ymm7[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm14[1,0],ymm1[5,4],ymm14[5,4] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm11[1,0],mem[1,0],ymm11[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[2,0],ymm8[2,3],ymm1[6,4],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm7, %xmm12 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[1],xmm12[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm1[1],ymm13[1],ymm1[3],ymm13[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[6],ymm5[6],ymm10[7],ymm5[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm1[1],ymm15[1],ymm1[3],ymm15[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,1],ymm8[2,0],ymm1[4,5],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm6[2],xmm9[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = zero,zero,xmm9[2],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] @@ -1255,41 +1250,39 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm8[2,0],ymm1[4,5],ymm8[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm4[2],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = zero,zero,xmm5[2],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[6],ymm3[6],ymm8[7],ymm3[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm2[2],xmm3[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm2[2],xmm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm12[1],ymm7[3],ymm12[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm11[2],ymm1[3],ymm11[3],ymm1[6],ymm11[6],ymm1[7],ymm11[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm7[2],xmm12[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm7[2],xmm13[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] @@ -1298,69 +1291,70 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, (%rsp), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[3,0],ymm8[3,0],ymm5[7,4],ymm8[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,0],ymm1[2,3],ymm4[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[3,0],xmm11[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm8[3,0],ymm3[7,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,0],ymm1[2,3],ymm3[6,4],ymm1[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,0],xmm9[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,0],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,0],ymm4[2,3],ymm5[6,4],ymm4[6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm3[2,3],ymm6[6,4],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm2[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,0],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm13[3,0],mem[3,0],ymm13[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,3],ymm3[6,4],ymm2[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm2[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,0],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm11[3,0],mem[3,0],ymm11[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm2[2,3],ymm4[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm13[2],xmm7[3],xmm13[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,0],xmm14[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[2,0],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) @@ -1370,71 +1364,70 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-ONLY-LABEL: load_i32_stride4_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: subq $680, %rsp # imm = 0x2A8 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4] ; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm10 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm7 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,4,0,4] -; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm3, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,4,0,4] +; AVX2-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps %ymm6, %ymm12 +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm3, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm13 +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 @@ -1443,143 +1436,146 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1,5,1,5,1,5,1,5] -; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm3, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [1,5,1,5] -; AVX2-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,5,1,5,1,5,1,5] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovaps %ymm7, %ymm8 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] +; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm3, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] -; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm6, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm9 -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm3, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm7, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm1, %ymm5 +; AVX2-ONLY-NEXT: vmovaps %ymm13, %ymm10 +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm1, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm13 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm7, %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps %ymm12, %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm1, %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm3, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm13 -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm13 +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,6,2,6] -; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vmovaps %ymm11, %ymm7 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [2,6,2,6] +; AVX2-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm3 +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm10 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [3,7,3,7,3,7,3,7] -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm10, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm10, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [3,7,3,7] -; AVX2-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm10, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm10, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm10, %ymm5 -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm10, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm1 = [3,7,3,7,3,7,3,7] +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm1, %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [3,7,3,7] +; AVX2-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm1, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm1, %ymm8 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm9[2],mem[2],xmm9[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm8 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -1596,17 +1592,17 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm13, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX2-ONLY-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX2-ONLY-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1687,81 +1683,78 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride4_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $1224, %rsp # imm = 0x4C8 -; SSE-NEXT: movaps 144(%rdi), %xmm4 -; SSE-NEXT: movaps 176(%rdi), %xmm5 +; SSE-NEXT: movaps 144(%rdi), %xmm14 +; SSE-NEXT: movaps 176(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 240(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm12 +; SSE-NEXT: movaps 64(%rdi), %xmm4 +; SSE-NEXT: movaps 112(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm4, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movaps %xmm8, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movaps 128(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: movaps %xmm2, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 368(%rdi), %xmm12 ; SSE-NEXT: movaps 352(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 336(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movaps 336(%rdi), %xmm13 +; SSE-NEXT: movaps 320(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 304(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps 288(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 272(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 272(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1782,26 +1775,26 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 432(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 416(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 432(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 416(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps 400(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 384(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 624(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 608(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 624(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 608(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps 592(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 576(%rdi), %xmm1 @@ -1847,11 +1840,12 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 672(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 656(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 640(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 656(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 640(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1874,8 +1868,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 816(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 800(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps 800(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 784(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1889,316 +1883,312 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1008(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 992(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps 992(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 976(%rdi), %xmm1 +; SSE-NEXT: movaps 976(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 960(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 960(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 944(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 928(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 912(%rdi), %xmm0 +; SSE-NEXT: movaps 928(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 912(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 896(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 896(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] -; SSE-NEXT: movaps 32(%rdi), %xmm6 +; SSE-NEXT: movaps 32(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movaps (%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movaps (%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: movaps 16(%rdi), %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: unpckhps (%rsp), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm4[0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movaps %xmm8, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm14[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm14[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm12 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm15 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: movaps %xmm9, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1] +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1] -; SSE-NEXT: movaps %xmm2, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm15[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm6[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 176(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 16(%rsi) -; SSE-NEXT: movaps %xmm5, 224(%rdx) -; SSE-NEXT: movaps %xmm14, 240(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 192(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 208(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 160(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 176(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 128(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 144(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 48(%rdx) -; SSE-NEXT: movaps %xmm10, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 16(%rdx) -; SSE-NEXT: movaps %xmm3, 240(%rcx) -; SSE-NEXT: movaps %xmm8, 224(%rcx) -; SSE-NEXT: movaps %xmm13, 208(%rcx) -; SSE-NEXT: movaps %xmm0, 192(%rcx) -; SSE-NEXT: movaps %xmm1, 176(%rcx) -; SSE-NEXT: movaps %xmm4, 160(%rcx) -; SSE-NEXT: movaps %xmm7, 144(%rcx) -; SSE-NEXT: movaps %xmm11, 128(%rcx) -; SSE-NEXT: movaps %xmm12, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rcx) +; SSE-NEXT: movaps %xmm0, %xmm12 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 240(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 224(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 240(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 192(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 208(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 160(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 176(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 128(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 144(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 16(%rdx) +; SSE-NEXT: movaps %xmm0, 240(%rcx) +; SSE-NEXT: movaps %xmm12, 224(%rcx) +; SSE-NEXT: movaps %xmm1, 208(%rcx) +; SSE-NEXT: movaps %xmm2, 192(%rcx) +; SSE-NEXT: movaps %xmm3, 176(%rcx) +; SSE-NEXT: movaps %xmm4, 160(%rcx) +; SSE-NEXT: movaps %xmm5, 144(%rcx) +; SSE-NEXT: movaps %xmm6, 128(%rcx) +; SSE-NEXT: movaps %xmm10, 112(%rcx) +; SSE-NEXT: movaps %xmm15, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2209,10 +2199,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps %xmm13, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%r8) -; SSE-NEXT: movaps %xmm9, 224(%r8) +; SSE-NEXT: movaps %xmm7, 224(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2223,11 +2213,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 160(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%r8) +; SSE-NEXT: movaps %xmm11, 128(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r8) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) @@ -2239,41 +2228,43 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm15, (%r8) +; SSE-NEXT: movaps %xmm9, (%r8) ; SSE-NEXT: addq $1224, %rsp # imm = 0x4C8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride4_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2200, %rsp # imm = 0x898 +; AVX1-ONLY-NEXT: subq $2184, %rsp # imm = 0x888 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[4],ymm9[4],ymm5[5],ymm9[5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm8 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm14[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm14[0],ymm5[1],ymm14[1],ymm5[4],ymm14[4],ymm5[5],ymm14[5] ; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm10 -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovaps %xmm6, %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2295,11 +2286,11 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] @@ -2319,11 +2310,11 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] @@ -2391,11 +2382,11 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] @@ -2409,8 +2400,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 @@ -2419,11 +2410,11 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1],ymm12[2,0],ymm13[4,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 @@ -2433,40 +2424,40 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm10 ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm8[0],ymm14[1],ymm8[1],ymm14[4],ymm8[4],ymm14[5],ymm8[5] +; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,0],ymm7[1,0],ymm9[5,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,0],ymm7[1,0],ymm14[5,4],ymm7[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm15, %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[1],xmm5[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm12[1],xmm4[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps %xmm13, %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm13[0],xmm3[1],xmm13[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[4],ymm8[4],ymm4[5],ymm8[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[4],ymm5[4],ymm8[5],ymm5[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm5[1,0],ymm10[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,0],ymm5[1,0],ymm9[5,4],ymm5[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[1],xmm15[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[1],xmm13[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2479,10 +2470,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2495,10 +2486,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2511,10 +2502,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2527,10 +2518,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2543,10 +2534,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2559,61 +2550,61 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm9[2],ymm14[3],ymm9[3],ymm14[6],ymm9[6],ymm14[7],ymm9[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[6],ymm14[6],ymm10[7],ymm14[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm3[2],xmm11[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm3[2],xmm7[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[6],ymm10[6],ymm5[7],ymm10[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[6],ymm9[6],ymm5[7],ymm9[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm6[2],xmm12[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm6[2],xmm11[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm10[1],ymm4[3],ymm10[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm8[1],ymm4[3],ymm8[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[6],ymm9[6],ymm5[7],ymm9[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[6],ymm5[6],ymm11[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm2[2],xmm3[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm2[2],xmm3[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm11[2],ymm7[3],ymm11[3],ymm7[6],ymm11[6],ymm7[7],ymm11[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = zero,zero,xmm13[2],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2626,10 +2617,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = zero,zero,xmm13[2],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2642,10 +2633,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = zero,zero,xmm13[2],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2658,10 +2649,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = zero,zero,xmm13[2],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2674,30 +2665,28 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = zero,zero,xmm13[2],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm14[3,0],ymm1[7,4],ymm14[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm10[3,0],ymm1[7,4],ymm10[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,0],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm10[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] @@ -2705,29 +2694,30 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm7[3,0],xmm14[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,0],ymm11[3,0],ymm5[7,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[3,0],xmm13[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm14[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm4[2],ymm10[3],ymm4[3],ymm10[6],ymm4[6],ymm10[7],ymm4[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm9[3,0],ymm5[3,0],ymm9[7,4],ymm5[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[2,0],ymm1[2,3],ymm13[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[2,0],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[6],ymm6[6],ymm8[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm7[3,0],ymm11[7,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[2,0],ymm10[2,3],ymm11[6,4],ymm10[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,0],ymm1[2,3],ymm12[6,4],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = xmm2[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm13[2,0],xmm11[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm13[2,0],xmm12[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] @@ -2739,9 +2729,9 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm2[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm11[2,0],xmm9[2,3] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm2[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,0],xmm9[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload @@ -2840,461 +2830,451 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: addq $2200, %rsp # imm = 0x898 +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%r8) +; AVX1-ONLY-NEXT: addq $2184, %rsp # imm = 0x888 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride4_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1960, %rsp # imm = 0x7A8 +; AVX2-ONLY-NEXT: subq $1992, %rsp # imm = 0x7C8 ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm13 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4] -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,4,0,4] -; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,4,0,4] +; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm14 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 912(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 912(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm3 -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm11 -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 528(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 528(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm13 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm15 +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm8 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm12 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovaps %ymm4, %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1,5,1,5,1,5,1,5] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm10, %ymm2 +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,5,1,5,1,5,1,5] +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm2, %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] ; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 944(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 944(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm15 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm11, %ymm12 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 560(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [2,6,2,6] -; AVX2-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [2,6,2,6] +; AVX2-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm7 +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps %ymm6, %ymm8 +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,7,3,7,3,7,3,7] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [3,7,3,7] ; AVX2-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps (%rsp), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps (%rsp), %xmm6, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) @@ -3332,10 +3312,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -3345,15 +3323,14 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 160(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-ONLY-NEXT: addq $1960, %rsp # imm = 0x7A8 +; AVX2-ONLY-NEXT: vmovaps %ymm12, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%r8) +; AVX2-ONLY-NEXT: addq $1992, %rsp # imm = 0x7C8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll index 853ea0fb70b0b..f658848e511c9 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -588,212 +588,206 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i32_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $296, %rsp # imm = 0x128 -; SSE-NEXT: movdqa 288(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm4 -; SSE-NEXT: movdqa 240(%rdi), %xmm9 -; SSE-NEXT: movdqa 256(%rdi), %xmm2 +; SSE-NEXT: subq $312, %rsp # imm = 0x138 +; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 272(%rdi), %xmm3 +; SSE-NEXT: movdqa 240(%rdi), %xmm14 +; SSE-NEXT: movdqa 256(%rdi), %xmm8 ; SSE-NEXT: movdqa (%rdi), %xmm11 ; SSE-NEXT: movdqa 16(%rdi), %xmm15 -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm8 +; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm7 ; SSE-NEXT: movdqa 160(%rdi), %xmm10 -; SSE-NEXT: movdqa 176(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa 176(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa %xmm8, %xmm13 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: movdqa %xmm11, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa 80(%rdi), %xmm4 +; SSE-NEXT: movdqa 96(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 128(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa 128(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm7 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 224(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 304(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm11 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] ; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm11[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm10[0],xmm15[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[2,2,2,2] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm15[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa (%rsp), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm14[0],xmm15[1] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm3[0],xmm14[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, 16(%rsi) +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm11[0],xmm14[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movaps %xmm11, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -808,126 +802,123 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm3, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rdx) -; SSE-NEXT: movapd %xmm13, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movapd %xmm9, 16(%rcx) +; SSE-NEXT: movapd %xmm12, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rcx) -; SSE-NEXT: movapd %xmm0, 16(%r8) +; SSE-NEXT: movapd %xmm1, 16(%r8) ; SSE-NEXT: movapd %xmm2, 48(%r8) ; SSE-NEXT: movapd %xmm6, (%r8) -; SSE-NEXT: movapd %xmm7, 32(%r8) +; SSE-NEXT: movapd %xmm8, 32(%r8) ; SSE-NEXT: movapd %xmm14, 16(%r9) ; SSE-NEXT: movapd %xmm15, 48(%r9) -; SSE-NEXT: movapd %xmm12, (%r9) -; SSE-NEXT: movapd %xmm1, 32(%r9) -; SSE-NEXT: addq $296, %rsp # imm = 0x128 +; SSE-NEXT: movapd %xmm13, (%r9) +; SSE-NEXT: movapd %xmm0, 32(%r9) +; SSE-NEXT: addq $312, %rsp # imm = 0x138 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $168, %rsp +; AVX1-ONLY-NEXT: subq $136, %rsp ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm13 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm14 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm2[4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm13 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1],ymm5[1,3],ymm3[6,5],ymm5[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,0],ymm1[3,0],ymm3[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm5[1,3],ymm4[6,5],ymm5[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm15[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm2[1,3],ymm1[6,5],ymm2[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm11[2,3],ymm2[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[3,0],ymm1[6,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm7[2,3],ymm2[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm4[3,0],ymm1[6,4],ymm4[7,4] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,0],ymm5[2,0],ymm14[7,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm5[2,1],ymm1[6,4],ymm5[6,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm13[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm5[2,0],ymm0[7,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,1],ymm0[6,4],ymm5[6,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm14[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[1,0],ymm4[0,0],ymm1[5,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm2[2,0],ymm0[7,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[1,0],ymm4[0,0],ymm1[5,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm2[2,0],ymm3[7,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,1],ymm0[6,4],ymm2[6,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm7[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm11[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[1,0],ymm3[0,0],ymm0[5,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[1,0],ymm3[0,0],ymm0[5,4],ymm3[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm6[0,0],ymm5[3,0],ymm6[4,4],ymm5[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm5[2,2],ymm14[6,4],ymm5[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,0],ymm5[3,0],ymm6[4,4],ymm5[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0],ymm5[2,2],ymm13[6,4],ymm5[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,0],mem[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[2,0],ymm4[1,0],ymm1[6,4],ymm4[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm13[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[2,0],ymm4[1,0],ymm1[6,4],ymm4[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,0],ymm2[3,0],ymm11[4,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm7[0,0],ymm2[3,0],ymm7[4,4],ymm2[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0],ymm2[2,2],ymm9[6,4],ymm2[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] @@ -936,17 +927,17 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3],ymm12[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],mem[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],mem[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] @@ -956,18 +947,18 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm14, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) -; AVX1-ONLY-NEXT: addq $168, %rsp +; AVX1-ONLY-NEXT: addq $136, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -975,99 +966,99 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: pushq %rax ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [0,5,2,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm10, %ymm7 -; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm7, %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,5,2,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm7, %ymm10 +; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm10, %ymm11 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm2[0,1,0,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4],ymm12[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6],ymm11[7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <1,6,3,u> -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm12, %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,2,7,0,5,2,7,0] -; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm14, %ymm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5,6],ymm13[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6],ymm11[7] +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,1,0,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4],ymm10[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm12, %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm14, %ymm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <1,6,3,u> +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm7, %ymm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,2,7,0,5,2,7,0] +; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm13, %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5,6],ymm12[7] +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm13, %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <2,7,4,u> -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm14, %ymm13 -; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3,4,5,6],ymm15[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm15 = [0,5,0,5,0,5,0,5] -; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm15, %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm14, %ymm7 -; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5,6],ymm14[7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <2,7,4,u> +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm7, %ymm13 +; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2,3,4,5,6],ymm14[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm14[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm15, %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm3[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm14 = [0,5,0,5,0,5,0,5] +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm14, %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm14, %ymm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm4[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm15 = [1,6,1,6,1,6,1,6] -; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm15, %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm10 = [1,6,1,6,1,6,1,6] +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm10, %ymm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = ymm6[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = ymm5[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm15, %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[0,1],ymm9[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <4,1,6,u> -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm5, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <4,1,6,u> +; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm6, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [2,7,2,7,2,7,2,7] ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm8, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm6, %ymm3 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm2[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm8, %ymm2 +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm8, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) @@ -1077,10 +1068,10 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, (%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-ONLY-NEXT: popq %rax @@ -1226,66 +1217,63 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride5_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $904, %rsp # imm = 0x388 -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 448(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 432(%rdi), %xmm4 -; SSE-NEXT: movdqa 400(%rdi), %xmm11 +; SSE-NEXT: movdqa 400(%rdi), %xmm10 ; SSE-NEXT: movdqa 416(%rdi), %xmm14 -; SSE-NEXT: movdqa 128(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm8 -; SSE-NEXT: movdqa 80(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm7 +; SSE-NEXT: movdqa 80(%rdi), %xmm12 ; SSE-NEXT: movdqa 96(%rdi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm5 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm7 -; SSE-NEXT: movdqa 336(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 336(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 368(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1303,25 +1291,26 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 560(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa 560(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 608(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 592(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa 592(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm6 +; SSE-NEXT: movdqa 160(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1345,284 +1334,292 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 144(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 144(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 464(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 384(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm1 +; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa 624(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm1 +; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 224(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 544(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 544(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm13[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,1,1] +; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm6, %xmm13 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm10 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,2,2,2] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm2[0],xmm8[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,2,2,2] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm15[0],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -1630,112 +1627,109 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: punpckhdq (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 16(%rcx) -; SSE-NEXT: movapd %xmm6, 112(%r8) -; SSE-NEXT: movapd %xmm8, 96(%r8) +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 112(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 16(%rcx) +; SSE-NEXT: movapd %xmm7, 112(%r8) +; SSE-NEXT: movapd %xmm9, 96(%r8) ; SSE-NEXT: movapd %xmm10, 80(%r8) -; SSE-NEXT: movapd %xmm11, 64(%r8) -; SSE-NEXT: movapd %xmm15, 48(%r8) -; SSE-NEXT: movapd %xmm13, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, (%r8) +; SSE-NEXT: movapd %xmm12, 64(%r8) +; SSE-NEXT: movapd %xmm13, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, (%r8) ; SSE-NEXT: movapd %xmm0, 112(%r9) ; SSE-NEXT: movapd %xmm1, 96(%r9) ; SSE-NEXT: movapd %xmm2, 80(%r9) ; SSE-NEXT: movapd %xmm3, 64(%r9) ; SSE-NEXT: movapd %xmm4, 48(%r9) ; SSE-NEXT: movapd %xmm5, 32(%r9) -; SSE-NEXT: movapd %xmm7, 16(%r9) -; SSE-NEXT: movapd %xmm9, (%r9) +; SSE-NEXT: movapd %xmm6, 16(%r9) +; SSE-NEXT: movapd %xmm8, (%r9) ; SSE-NEXT: addq $904, %rsp # imm = 0x388 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1000, %rsp # imm = 0x3E8 +; AVX1-ONLY-NEXT: subq $952, %rsp # imm = 0x3B8 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm15 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] @@ -1744,82 +1738,84 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm15 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm7[1,3],ymm0[6,5],ymm7[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm6[1,3],ymm0[6,5],ymm6[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm14[1,3],ymm0[6,5],ymm14[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm5[2,3],ymm14[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] @@ -1829,26 +1825,26 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm5[1,3],ymm0[6,5],ymm5[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm11[2,3],ymm5[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm11[1,3],ymm0[6,5],ymm11[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm3[2,3],ymm11[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm3[2,3],ymm12[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1861,273 +1857,268 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm7[2,0],ymm13[7,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[2,1],ymm0[6,4],ymm7[6,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm15[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm8[2,0],ymm13[7,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,1],ymm0[6,4],ymm8[6,5] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm7[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm13[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm10[0,0],ymm13[5,4],ymm10[4,4] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm3[0,0],ymm5[5,4],ymm3[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm6[2,0],ymm14[7,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,1],ymm0[6,4],ymm6[6,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm14[2,0],ymm12[7,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[2,1],ymm0[6,4],ymm14[6,5] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm6[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,0],ymm3[0,0],ymm14[5,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm2[0,0],ymm7[5,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm5[2,0],ymm0[7,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,1],ymm0[6,4],ymm5[6,5] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm11[2,0],ymm0[7,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm11[2,1],ymm0[6,4],ymm11[6,5] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,0],ymm6[0,0],ymm11[5,4],ymm6[4,4] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,0],ymm13[0,0],ymm9[5,4],ymm13[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm12[2,0],ymm0[7,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm12[2,1],ymm0[6,4],ymm12[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm10[2,0],ymm0[7,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm10[2,1],ymm0[6,4],ymm10[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm8[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[1,0],ymm5[0,0],ymm8[5,4],ymm5[4,4] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2],xmm6[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm6 = xmm6[1,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[1,0],ymm12[0,0],ymm6[5,4],ymm12[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm7[3,0],ymm1[4,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm7[2,2],ymm15[6,4],ymm7[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[0,0],ymm8[3,0],ymm4[4,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm8[2,2],ymm15[6,4],ymm8[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[2,0],ymm10[1,0],ymm13[6,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[2,0],ymm3[1,0],ymm5[6,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,0],ymm9[3,0],ymm2[4,4],ymm9[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm9[2,2],ymm15[6,4],ymm9[6,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm14[3,0],ymm3[4,4],ymm14[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm14[2,2],ymm15[6,4],ymm14[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[2,0],ymm3[1,0],ymm14[6,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm7[2,0],ymm2[1,0],ymm7[6,4],ymm2[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm4[3,0],ymm3[4,4],ymm4[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm4[2,2],ymm15[6,4],ymm4[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,0],ymm11[3,0],ymm2[4,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm11[2,2],ymm15[6,4],ymm11[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[2,0],ymm6[1,0],ymm11[6,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[2,0],ymm13[1,0],ymm9[6,4],ymm13[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[0,0],ymm12[3,0],ymm4[4,4],ymm12[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm12[2,2],ymm15[6,4],ymm12[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm10[3,0],ymm1[4,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[2,0],ymm5[1,0],ymm8[6,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,0],ymm12[1,0],ymm6[6,4],ymm12[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm1[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],mem[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1,2],ymm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm2[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],mem[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5,6],ymm14[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3],ymm2[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],mem[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm9[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%r9) -; AVX1-ONLY-NEXT: addq $1000, %rsp # imm = 0x3E8 +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) +; AVX1-ONLY-NEXT: addq $952, %rsp # imm = 0x3B8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride5_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $1000, %rsp # imm = 0x3E8 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4],ymm3[5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm11[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4],ymm3[5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm13, %ymm12 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm13 ; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm10 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm12 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa %ymm5, %ymm10 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm11, %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4],ymm2[5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm5[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4],ymm1[5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,6,3,u> -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm8[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [5,2,7,0,5,2,7,0] ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 @@ -2135,173 +2126,174 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm4[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm5[2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm13[2,3],ymm10[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm10, %ymm11 ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 624(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa %ymm11, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm11[2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm12[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm15[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm14[0,1],mem[2,3],ymm14[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 464(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <2,7,4,u> -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm6[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm14 ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa %ymm15, %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovdqa %ymm12, %ymm8 ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm4[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm14, %ymm15 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm5, %ymm13 ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovdqa %ymm11, %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm10[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa %ymm13, %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm7 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm9 = [1,6,1,6,1,6,1,6] -; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm9, %ymm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm11[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,3,2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1,6,1,6,1,6,1,6] +; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm5, %ymm10 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm15[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm3[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm9, %ymm10 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm5, %ymm10 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm1[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm5[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm14[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm9, %ymm10 -; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm15 +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm5, %ymm10 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm8[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm2[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm13[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm13[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd $207, (%rsp), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm11[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[0,1],ymm1[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm1[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <4,1,6,u> ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm10, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm9 = [2,7,2,7,2,7,2,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,7,2,7,2,7,2,7] +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3],ymm2[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vperm2i128 $2, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = mem[0,1],ymm12[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm10, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm1[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm10, %ymm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm9, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm13[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm14[4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm10, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm14[0,1,2,3],mem[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm10, %ymm4 -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm8[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm9, %ymm1 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm7[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm5, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) @@ -2566,33 +2558,33 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 736(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm6 +; SSE-NEXT: movdqa 448(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 416(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm8 +; SSE-NEXT: movdqa 432(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm9 +; SSE-NEXT: movdqa 400(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 416(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] @@ -2602,11 +2594,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1040(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1040(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1056(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 1088(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2616,18 +2609,16 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm12 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2673,32 +2664,32 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm6 -; SSE-NEXT: movdqa 256(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 240(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 272(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 560(%rdi), %xmm15 +; SSE-NEXT: movdqa 560(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 576(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 608(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 592(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa 592(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2718,7 +2709,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1200(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 1216(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] @@ -2732,41 +2723,43 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm14 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa 160(%rdi), %xmm3 +; SSE-NEXT: movdqa 176(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 480(%rdi), %xmm8 -; SSE-NEXT: movdqa 496(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 480(%rdi), %xmm9 +; SSE-NEXT: movdqa 496(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 528(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 512(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 800(%rdi), %xmm13 +; SSE-NEXT: movdqa 800(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 816(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 848(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2777,17 +2770,17 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1120(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1136(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1168(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1168(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1152(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2798,45 +2791,43 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 224(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2845,15 +2836,15 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 464(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 384(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2863,49 +2854,75 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 624(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 544(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 784(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 704(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa 944(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa 864(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -2913,10 +2930,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 944(%rdi), %xmm1 +; SSE-NEXT: movdqa 1104(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -2924,11 +2941,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 864(%rdi), %xmm1 +; SSE-NEXT: movdqa 1024(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -2936,12 +2954,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 1104(%rdi), %xmm1 +; SSE-NEXT: movdqa 1264(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -2949,25 +2967,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 1024(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 1264(%rdi), %xmm1 +; SSE-NEXT: movdqa 1184(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -2975,27 +2980,23 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 1184(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3004,37 +3005,30 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] @@ -3065,20 +3059,22 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -3093,106 +3089,123 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -3201,8 +3214,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -3211,20 +3224,29 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] @@ -3236,107 +3258,77 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm13 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm4[0],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm4[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload @@ -3344,8 +3336,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload @@ -3353,8 +3344,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload @@ -3362,7 +3352,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload @@ -3370,8 +3360,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload @@ -3383,12 +3372,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -3405,15 +3394,14 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm15[0],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -3529,7 +3517,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, (%rcx) ; SSE-NEXT: movapd %xmm13, 240(%r8) -; SSE-NEXT: movaps (%rsp), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: movaps %xmm13, 224(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: movaps %xmm13, 208(%r8) @@ -3575,7 +3563,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm14, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: addq $1928, %rsp # imm = 0x788 ; SSE-NEXT: retq @@ -3583,322 +3571,321 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-LABEL: load_i32_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $2488, %rsp # imm = 0x9B8 -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 768(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 768(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm6[1,3],ymm2[6,5],ymm6[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm5[1,3],ymm2[6,5],ymm5[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm13[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm8[1,3],ymm2[6,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 944(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm9[1,3],ymm2[6,5],ymm9[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 1264(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm7[1,3],ymm2[6,5],ymm7[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm5[1,3],ymm0[6,5],ymm5[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm14[1,3],ymm2[6,5],ymm14[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm4[1,3],ymm0[6,5],ymm4[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm1[1,3],ymm0[6,5],ymm1[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 944(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm13[1,3],ymm0[6,5],ymm13[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 1264(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm10[1,3],ymm2[6,5],ymm10[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 784(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm15[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm11[1,3],ymm2[6,5],ymm11[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1],ymm15[2,3],ymm11[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm14[1,3],ymm0[6,5],ymm14[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm7[2,3],ymm14[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm6[2,3],ymm12[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 784(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 1104(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 1104(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[2,0],ymm0[7,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,1],ymm0[6,4],ymm1[6,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm2[0,0],ymm1[5,4],ymm2[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[0,0],ymm2[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm5[2,0],ymm0[7,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,1],ymm0[6,4],ymm5[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm4[2,0],ymm0[7,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,1],ymm0[6,4],ymm4[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] @@ -3911,10 +3898,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm9[2,0],ymm0[7,4],ymm9[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[2,1],ymm0[6,4],ymm9[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[2,0],ymm0[7,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,1],ymm0[6,4],ymm1[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] @@ -3930,10 +3917,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[2,0],ymm0[7,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,1],ymm0[6,4],ymm1[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm13[2,0],ymm0[7,4],ymm13[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm13[2,1],ymm0[6,4],ymm13[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] @@ -3943,18 +3930,19 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm2[0,0],ymm1[5,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm6[0,0],ymm1[5,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm7[2,0],ymm0[7,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[2,1],ymm0[6,4],ymm7[6,5] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[2,0],ymm0[7,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,1],ymm0[6,4],ymm8[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] @@ -3969,24 +3957,24 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm14[2,0],ymm0[7,4],ymm14[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm14[2,1],ymm1[6,4],ymm14[6,5] -; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm6 +; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm8 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm0[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,0],ymm0[0,0],ymm11[5,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm12[1,0],ymm0[0,0],ymm12[5,4],ymm0[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,0],ymm10[2,0],ymm0[7,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm10[2,1],ymm2[6,4],ymm10[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,0],ymm7[2,0],ymm0[7,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm7[2,1],ymm2[6,4],ymm7[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] @@ -3994,27 +3982,26 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm14[1,0],ymm0[0,0],ymm14[5,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm13[1,0],ymm0[0,0],ymm13[5,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm13[2,0],ymm0[7,4],ymm13[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm13[2,1],ymm3[6,4],ymm13[6,5] -; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm10[2,0],ymm0[7,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm10[2,1],ymm3[6,4],ymm10[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2],xmm12[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm12 = xmm12[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,0],ymm1[0,0],ymm12[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2],xmm11[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm11 = xmm11[1,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[1,0],ymm1[0,0],ymm11[5,4],ymm1[4,4] ; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] @@ -4029,9 +4016,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[2,2],ymm15[6,4],ymm1[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm13[2,0],mem[1,0],ymm13[6,4],mem[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4039,8 +4026,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[0,0],ymm1[3,0],ymm8[4,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[2,2],ymm15[6,4],ymm1[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] @@ -4053,9 +4040,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,0],ymm9[3,0],ymm15[4,4],ymm9[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm9[2,2],ymm15[6,4],ymm9[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm2[2,2],ymm15[6,4],ymm2[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -4074,8 +4062,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,0],ymm6[1,0],ymm1[6,4],ymm6[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4083,11 +4070,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm2[2,2],ymm15[6,4],ymm2[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm10[3,0],ymm1[4,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,0],ymm3[1,0],ymm12[6,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[2,0],ymm3[1,0],ymm11[6,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4095,26 +4082,27 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,0],ymm10[3,0],ymm9[4,4],ymm10[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6] +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,0],ymm7[3,0],ymm9[4,4],ymm7[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm7[2,2],ymm15[6,4],ymm7[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[2,0],ymm10[1,0],ymm14[6,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[2,0],ymm10[1,0],ymm13[6,4],ymm10[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm6[3,0],ymm3[4,4],ymm6[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm6[2,2],ymm15[6,4],ymm6[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[0,0],ymm8[3,0],ymm6[4,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm8[2,2],ymm15[6,4],ymm8[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[2,0],ymm6[1,0],ymm11[6,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,0],ymm3[1,0],ymm12[6,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4122,22 +4110,23 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm7[3,0],ymm1[4,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm7[2,2],ymm15[6,4],ymm7[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm8[3,0],ymm1[4,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm8[2,2],ymm15[6,4],ymm8[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,0],ymm2[1,0],ymm4[6,4],ymm2[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -4149,26 +4138,27 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],mem[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5,6],ymm13[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm12[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,1,2,3,4],mem[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],mem[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] @@ -4176,16 +4166,15 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm14[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm13[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 16-byte Folded Reload @@ -4206,13 +4195,13 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1,2,3,4],mem[5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5,6],ymm12[7] +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5,6],ymm11[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 16-byte Folded Reload @@ -4309,9 +4298,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4321,17 +4310,17 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] @@ -4340,44 +4329,44 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm9 +; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm6[2,3],ymm15[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm7, %ymm12 +; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm14 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa %ymm5, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 928(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm12[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 1248(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4405,11 +4394,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinserti128 $1, 768(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4421,28 +4410,29 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinserti128 $1, 1088(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm13[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4],ymm1[5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm15[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,6,3,u> ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm6[0,1],mem[2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd $51, (%rsp), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm11[0,1],mem[2,3],ymm11[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,2,7,0,5,2,7,0] ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 @@ -4450,75 +4440,77 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm7[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 624(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm15[0,1],mem[2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm12, %ymm14 +; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 944(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm8[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 1264(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa %ymm10, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm13[2,3],ymm10[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm15[2,3],ymm4[4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm5[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm7[0,1],mem[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 464(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm10[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 784(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -4529,12 +4521,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <2,7,4,u> -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm2 @@ -4543,10 +4533,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm7[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 @@ -4554,12 +4545,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm14[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm11[4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 896(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm2 @@ -4567,12 +4557,13 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm6[4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 1216(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovdqa %ymm13, %ymm9 ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm2 @@ -4581,8 +4572,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] @@ -4593,12 +4584,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm5[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-ONLY-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendd $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5,6],ymm4[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,6],ymm4[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm1 @@ -4606,11 +4596,13 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3],ymm10[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm10, %ymm14 ; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm3, %ymm4 ; AVX2-ONLY-NEXT: vinserti128 $1, 736(%rdi), %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX2-ONLY-NEXT: vmovdqa %ymm13, %ymm7 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5,6],ymm15[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2],ymm15[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm1 @@ -4618,9 +4610,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm5[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vinserti128 $1, 1056(%rdi), %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -4633,8 +4625,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm10[0,1,2,3],mem[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $207, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload @@ -4645,9 +4638,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3],ymm3[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload @@ -4661,17 +4653,17 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm9[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload @@ -4680,73 +4672,73 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm2[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm7[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm9[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm4[4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm9[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm1[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm5[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm5[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[0,1],ymm1[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[0,1],ymm6[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <4,1,6,u> ; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7] -; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm7, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, (%rsp), %ymm6, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm6[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, (%rsp), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm11[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm11[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm8, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[0,1],ymm9[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm9[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm8, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm7, %ymm2 +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm7, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -4759,14 +4751,14 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm3[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm3[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm12[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm10[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm8, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm7, %ymm4 +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload @@ -4780,19 +4772,19 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm11[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3],ymm3[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm12[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm8, %ymm9 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm8, %ymm8 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload @@ -4801,14 +4793,14 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -4817,60 +4809,60 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm7, 224(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm3, 192(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm4, 160(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm2, 128(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm5, 96(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-ONLY-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX2-ONLY-NEXT: vzeroupper @@ -4879,201 +4871,203 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-LABEL: load_i32_stride5_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $584, %rsp # imm = 0x248 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm21 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm21 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm30 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm6, %zmm24 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,5,10,15,20,25,30,u> -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <0,5,10,15,20,25,30,u> +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm19, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm6, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm19, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm6, %zmm5 +; AVX512F-NEXT: vpermi2d %zmm20, %zmm1, %zmm19 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm6 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm5, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = <17,22,27,0,5,10,15,u> -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm18, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm5, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm5, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <17,22,27,0,5,10,15,u> +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm16, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm26 = <2,7,12,17,22,27,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm26, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm22, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm22, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm22, %zmm28 -; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm22 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm29, %zmm30 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm21, %zmm31, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm29, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm29, %zmm17 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm11, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm31, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <3,8,13,18,23,28,u,u> -; AVX512F-NEXT: vpermt2d %zmm10, %zmm31, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm11, %zmm16 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm31 = <4,9,14,19,24,29,u,u> -; AVX512F-NEXT: vpermt2d %zmm13, %zmm31, %zmm20 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm19, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm18, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm26, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm11, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm31, %zmm13 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm19, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm25 = <2,7,12,17,22,27,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] +; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm14, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm14, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm14, %zmm22 +; AVX512F-NEXT: vpermi2d %zmm20, %zmm1, %zmm14 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] +; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm28, %zmm27 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm28, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm28, %zmm26 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <3,8,13,18,23,28,u,u> +; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <4,9,14,19,24,29,u,u> +; AVX512F-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm15, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm12, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm25, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm5, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm9, %zmm17 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm15, %zmm2 ; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm1 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm18, %zmm5 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm26, %zmm8 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm11, %zmm10 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm31, %zmm2 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm21, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm12, %zmm3 +; AVX512F-NEXT: vpermi2d %zmm21, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm25, %zmm6 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm21, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm21, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm9, %zmm11 ; AVX512F-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] -; AVX512F-NEXT: vpermt2d %zmm3, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm31 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm15, %zmm7 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm31 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm19 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512F-NEXT: vpermt2d %zmm3, %zmm15, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm15, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm15, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm12 ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm8, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] -; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm8, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm29 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm28 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] -; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm20 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} -; AVX512F-NEXT: vpermt2d %zmm31, %zmm8, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm17 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm21 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm18, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm27, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm9, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm5, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm31, 64(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm14, 192(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm29, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm30, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm20, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm28, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm30, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%r9) ; AVX512F-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -5081,201 +5075,203 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i32_stride5_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $584, %rsp # imm = 0x248 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm21 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm21 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm6, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,5,10,15,20,25,30,u> -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,5,10,15,20,25,30,u> +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm19, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm6, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm19, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm6, %zmm5 +; AVX512BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm19 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <17,22,27,0,5,10,15,u> -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm18, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <17,22,27,0,5,10,15,u> +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm16, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <2,7,12,17,22,27,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm26, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm22, %zmm28 -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm22 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm29, %zmm30 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm31, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm29, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm29, %zmm17 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <3,8,13,18,23,28,u,u> -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm31, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm11, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = <4,9,14,19,24,29,u,u> -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm31, %zmm20 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm19, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm18, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm26, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm11, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm13 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm19, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = <2,7,12,17,22,27,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm14, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm14, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm14, %zmm22 +; AVX512BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm14 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] +; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm28, %zmm27 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm28, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm28, %zmm26 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <3,8,13,18,23,28,u,u> +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <4,9,14,19,24,29,u,u> +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm15, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm12, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm25, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm9, %zmm17 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm15, %zmm2 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm1 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm18, %zmm5 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm26, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm11, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm21, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm3 +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm21, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm21, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm9, %zmm11 ; AVX512BW-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm15, %zmm7 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm31 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm15, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm15, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm12 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm8, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm8, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm28 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm20 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm8, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm17 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm21 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm27, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm30, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%r9) ; AVX512BW-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index d179de0a039d3..2d8ada07be825 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -19,29 +19,29 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i32_stride6_vf2: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: movq %xmm0, (%rsi) +; SSE-NEXT: movq %xmm1, (%rsi) ; SSE-NEXT: movq %xmm4, (%rdx) ; SSE-NEXT: movq %xmm5, (%rcx) ; SSE-NEXT: movq %xmm6, (%r8) -; SSE-NEXT: movq %xmm1, (%r9) +; SSE-NEXT: movq %xmm0, (%r9) ; SSE-NEXT: movq %xmm7, (%rax) ; SSE-NEXT: retq ; @@ -255,47 +255,48 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[3,3,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm4[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm8[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm2[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm10 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm8[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm9, (%rsi) +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] +; SSE-NEXT: movapd %xmm4, (%rsi) ; SSE-NEXT: movapd %xmm3, (%rdx) ; SSE-NEXT: movapd %xmm5, (%rcx) -; SSE-NEXT: movapd %xmm10, (%r8) -; SSE-NEXT: movapd %xmm8, (%r9) +; SSE-NEXT: movapd %xmm6, (%r8) +; SSE-NEXT: movapd %xmm9, (%r9) ; SSE-NEXT: movapd %xmm0, (%rax) ; SSE-NEXT: retq ; @@ -433,124 +434,122 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i32_stride6_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 144(%rdi), %xmm12 +; SSE-NEXT: movdqa 144(%rdi), %xmm4 ; SSE-NEXT: movdqa 160(%rdi), %xmm2 -; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa 96(%rdi), %xmm6 ; SSE-NEXT: movdqa 112(%rdi), %xmm3 -; SSE-NEXT: movdqa 64(%rdi), %xmm6 -; SSE-NEXT: movdqa (%rdi), %xmm7 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 48(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm8[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm9[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: movdqa 80(%rdi), %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm14[0],xmm12[1] +; SSE-NEXT: movdqa 80(%rdi), %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm15[0],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm13[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa 176(%rdi), %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm13[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa 176(%rdi), %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa 128(%rdi), %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm10[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[2,3,2,3] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] -; SSE-NEXT: movapd %xmm10, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movapd %xmm12, 16(%rdx) ; SSE-NEXT: movapd %xmm11, (%rdx) ; SSE-NEXT: movapd %xmm13, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movapd %xmm4, 16(%r8) -; SSE-NEXT: movapd %xmm7, (%r8) -; SSE-NEXT: movapd %xmm0, 16(%r9) -; SSE-NEXT: movapd %xmm5, (%r9) +; SSE-NEXT: movapd %xmm8, (%r8) +; SSE-NEXT: movapd %xmm10, 16(%r9) +; SSE-NEXT: movapd %xmm6, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm9, 16(%rax) -; SSE-NEXT: movapd %xmm6, (%rax) +; SSE-NEXT: movapd %xmm2, 16(%rax) +; SSE-NEXT: movapd %xmm9, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride6_vf8: @@ -953,511 +952,516 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i32_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $360, %rsp # imm = 0x168 -; SSE-NEXT: movdqa 240(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm5 -; SSE-NEXT: movdqa 192(%rdi), %xmm13 -; SSE-NEXT: movdqa 208(%rdi), %xmm14 -; SSE-NEXT: movdqa 336(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm7 -; SSE-NEXT: movdqa 288(%rdi), %xmm12 -; SSE-NEXT: movdqa 304(%rdi), %xmm8 -; SSE-NEXT: movdqa 64(%rdi), %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: subq $408, %rsp # imm = 0x198 +; SSE-NEXT: movdqa 240(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm3 +; SSE-NEXT: movdqa 192(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm4 +; SSE-NEXT: movdqa 336(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm5 +; SSE-NEXT: movdqa 288(%rdi), %xmm15 +; SSE-NEXT: movdqa 304(%rdi), %xmm7 +; SSE-NEXT: movdqa 64(%rdi), %xmm12 +; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa 144(%rdi), %xmm15 -; SSE-NEXT: movdqa 160(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa 112(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 144(%rdi), %xmm9 +; SSE-NEXT: movdqa 160(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa 80(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa 32(%rdi), %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa 368(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa 320(%rdi), %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: movdqa 272(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: movdqa 224(%rdi), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa 176(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa 80(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa 32(%rdi), %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa 368(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa 320(%rdi), %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm15[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa 272(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa 224(%rdi), %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa 176(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa 128(%rdi), %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd $238, (%rsp), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movapd %xmm14, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movapd %xmm0, 16(%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movapd %xmm15, 16(%r8) +; SSE-NEXT: movapd %xmm12, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movapd %xmm2, 16(%r9) ; SSE-NEXT: movapd %xmm3, 32(%r9) ; SSE-NEXT: movapd %xmm4, 48(%r9) -; SSE-NEXT: movapd %xmm15, (%r9) +; SSE-NEXT: movapd %xmm5, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm11, 16(%rax) +; SSE-NEXT: movapd %xmm13, 16(%rax) ; SSE-NEXT: movapd %xmm9, 32(%rax) -; SSE-NEXT: movapd %xmm12, 48(%rax) +; SSE-NEXT: movapd %xmm8, 48(%rax) ; SSE-NEXT: movapd %xmm10, (%rax) -; SSE-NEXT: addq $360, %rsp # imm = 0x168 +; SSE-NEXT: addq $408, %rsp # imm = 0x198 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride6_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,0],ymm6[0,0],ymm2[6,4],ymm6[4,4] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm6[2,2],ymm4[6,4],ymm6[6,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm7[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm9[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,2],xmm2[0,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm14[2,3],ymm0[0,1] -; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm7[0],ymm14[0],ymm7[3],ymm14[2] +; AVX1-ONLY-NEXT: subq $328, %rsp # imm = 0x148 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[2,0],ymm2[0,0],ymm3[6,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm2[2,2],ymm5[6,4],ymm2[6,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,2],xmm7[0,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm13[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm8[0],ymm13[0],ymm8[3],ymm13[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[2,0],ymm1[0,0],ymm8[6,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm1[2,2],ymm4[6,4],ymm1[6,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm10[2,0],ymm1[0,0],ymm10[6,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm1[2,2],ymm5[6,4],ymm1[6,6] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm5[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm12[4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm3[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[0,1] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm5[0],ymm12[0],ymm5[3],ymm12[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[0,1] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm4[0],ymm12[0],ymm4[3],ymm12[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm13[3,0],ymm6[1,0],ymm13[7,4],ymm6[5,4] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[2,0],ymm6[2,3],ymm11[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[1,0],xmm2[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[0,2],xmm2[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[3,1],ymm14[1,3],ymm7[7,5],ymm14[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm9[3,0],ymm2[1,0],ymm9[7,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[2,0],ymm2[2,3],ymm11[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,0],xmm7[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[3,1],ymm13[1,3],ymm8[7,5],ymm13[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,0],ymm1[1,0],ymm8[7,4],ymm1[5,4] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[3,0],ymm1[1,0],ymm10[7,4],ymm1[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[2,3],ymm2[6,4],ymm1[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[1,0],xmm0[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1],ymm12[1,3],ymm5[7,5],ymm12[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1],ymm12[1,3],ymm4[7,5],ymm12[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[2,1],ymm8[2,0],ymm13[6,5],ymm8[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,0],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm14[0,1],ymm10[2],ymm14[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,0],ymm5[2,0],ymm7[4,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm14[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[2,1],ymm7[2,0],ymm9[6,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,0],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm9[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0],ymm4[2,0],ymm8[4,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm11 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,1],ymm6[2,0],ymm4[6,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm13 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[2,1],ymm3[2,0],ymm10[6,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[2,0],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm11[2,0],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[0,0],ymm4[2,0],ymm0[4,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1],ymm5[3,1],ymm7[4,5],ymm5[7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1],xmm3[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[3,1],ymm8[2,1],ymm9[7,5],ymm8[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[0,0],ymm14[2,0],ymm0[4,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm4[3,1],ymm0[4,5],ymm4[7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[3,1],xmm15[3,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[3,1],ymm6[2,1],ymm13[7,5],ymm6[6,5] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm4[3,1],ymm8[4,5],ymm4[7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1],xmm5[3,3] +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[3,1],ymm1[2,1],ymm7[7,5],ymm1[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm14[3,1],ymm0[4,5],ymm14[7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[3,1],xmm15[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[3,1],ymm3[2,1],ymm4[7,5],ymm3[6,5] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm7[1],ymm8[0],ymm7[2],ymm8[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,0],ymm9[4,5],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[2,0],ymm8[0,0],ymm0[6,4],ymm8[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2],ymm9[2,0],ymm8[4,6],ymm9[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0],xmm12[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm2[1],ymm6[0],ymm2[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm13[0,1],ymm6[2,0],ymm13[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[2,0],ymm12[0,0],ymm9[6,4],ymm12[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[0,2],ymm15[2,0],ymm12[4,6],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[1,0],ymm0[7,4],ymm8[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,3],ymm0[2,0],ymm8[4,7],ymm0[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vshufps $215, (%rsp), %ymm7, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm7[3,1],mem[1,3],ymm7[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1],ymm5[2,0],ymm14[5,5],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[3,0],ymm12[1,0],ymm9[7,4],ymm12[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,3],ymm4[2,0],ymm12[4,7],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm3[1,3],ymm2[7,5],ymm3[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1],ymm2[2,0],ymm13[5,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm6[1],ymm1[0],ymm6[2],ymm1[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[0,1],ymm8[2,0],ymm7[4,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm9[2,0],ymm8[0,0],ymm9[6,4],ymm8[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm8[0,2],ymm11[2,0],ymm8[4,6],ymm11[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[2,2,3,3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0],xmm14[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm1[1],ymm15[0],ymm1[2],ymm15[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm7[2,0],ymm4[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[2,0],ymm14[0,0],ymm11[6,4],ymm14[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm4[2,0],ymm14[4,6],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[3,0],ymm8[1,0],ymm9[7,4],ymm8[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,3],ymm7[2,0],ymm8[4,7],ymm7[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm6[3,1],mem[1,3],ymm6[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm13[1,1],ymm5[2,0],ymm13[5,5],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm11[3,0],ymm14[1,0],ymm11[7,4],ymm14[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,3],ymm5[2,0],ymm14[4,7],ymm5[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1],ymm15[1,3],ymm1[7,5],ymm15[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1],ymm1[2,0],ymm10[5,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) +; AVX1-ONLY-NEXT: addq $328, %rsp # imm = 0x148 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride6_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $200, %rsp -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: subq $232, %rsp +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm1 = <0,6,4,u> -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm6 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm14 = <0,6,4,u> +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm12[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm10[0,1],ymm7[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm10[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm1, %ymm14 -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm6[0,1],ymm9[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm9[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm14 @@ -1467,69 +1471,70 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm4, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm5 = [5,3,5,3,5,3,5,3] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm5, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -1538,78 +1543,79 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, (%rsp), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] -; AVX2-SLOW-NEXT: # xmm5 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6] -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm10, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm6[2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm4 = [4,2,4,2] ; AVX2-SLOW-NEXT: # xmm4 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm2, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm4, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,2,0,6,0,2,0,6] +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm8[2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = [5,3,5,3] +; AVX2-SLOW-NEXT: # xmm2 = mem[0,0] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm2, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,3,1,7,0,3,1,7] +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm10[3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-SLOW-NEXT: vmovaps %ymm13, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r8) ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rax) -; AVX2-SLOW-NEXT: addq $200, %rsp +; AVX2-SLOW-NEXT: addq $232, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride6_vf16: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $200, %rsp -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm2 @@ -1617,25 +1623,25 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <0,6,4,u> -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm12[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm8[0,1],ymm7[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2] ; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm10[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm5 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm6 +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm6[0,1],ymm9[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm14 @@ -1645,41 +1651,41 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm4, %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm5 = [5,3,5,3,5,3,5,3] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm5, %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [2,0,6,4,2,0,6,7] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] @@ -1690,23 +1696,24 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovaps %ymm13, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -1715,56 +1722,56 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] ; AVX2-FAST-NEXT: # xmm5 = mem[0,0] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm10, %ymm14 +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm5, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm9, %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm12[2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm6[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] -; AVX2-FAST-NEXT: # xmm4 = mem[0,0] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm10[0,1],ymm4[2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm2 = [5,3,5,3] +; AVX2-FAST-NEXT: # xmm2 = mem[0,0] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,3,1,7,0,3,1,7] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rcx) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload @@ -1782,37 +1789,37 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride6_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: subq $232, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm1 = <0,6,4,u> -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm14 = <0,6,4,u> +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm12[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm10[0,1],ymm7[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm10[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm1, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm6[0,1],ymm9[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm14 @@ -1822,69 +1829,70 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm4, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm5 = [5,3,5,3,5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm5, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -1893,68 +1901,69 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, (%rsp), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm10, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm6[2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm4 = [4,2,4,2] ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm2, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm4, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm6, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm8[2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm2 = [5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm2, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,3,1,7,0,3,1,7] +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm10[3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $200, %rsp +; AVX2-FAST-PERLANE-NEXT: addq $232, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2152,478 +2161,477 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i32_stride6_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $1016, %rsp # imm = 0x3F8 -; SSE-NEXT: movdqa 64(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm7 +; SSE-NEXT: subq $1032, %rsp # imm = 0x408 +; SSE-NEXT: movdqa 64(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 528(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm14 -; SSE-NEXT: movdqa 528(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm4 +; SSE-NEXT: movdqa 544(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 480(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 496(%rdi), %xmm15 -; SSE-NEXT: movdqa 144(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm12 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 +; SSE-NEXT: movdqa 496(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 144(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm2 +; SSE-NEXT: movdqa 96(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,0,1,1] -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 384(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 400(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 432(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 448(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 304(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 304(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 336(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm3 +; SSE-NEXT: movdqa 336(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 688(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa 352(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm2, %xmm12 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 672(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 688(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 720(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 736(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm3 +; SSE-NEXT: movdqa 720(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa 240(%rdi), %xmm1 +; SSE-NEXT: movdqa 736(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm0 +; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm1 -; SSE-NEXT: movdqa 592(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 640(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 240(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm4[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm3[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm7 +; SSE-NEXT: movdqa 592(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, (%rsp), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 624(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 640(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa 176(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa 176(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] ; SSE-NEXT: movdqa 128(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa 80(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa 368(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa 80(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa 32(%rdi), %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa 368(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] ; SSE-NEXT: movdqa 320(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa 272(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: movdqa 224(%rdi), %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 560(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa 272(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm14 +; SSE-NEXT: movdqa 224(%rdi), %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa 560(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa 512(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa 464(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa 464(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa 416(%rdi), %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa 752(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: movdqa 704(%rdi), %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa 656(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa 608(%rdi), %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa 752(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movdqa 704(%rdi), %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa 656(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa 608(%rdi), %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa %xmm13, %xmm9 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] -; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: movapd %xmm15, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -2632,6 +2640,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, %xmm11 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2641,7 +2650,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm14, %xmm12 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2649,20 +2658,20 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, %xmm10 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2670,21 +2679,22 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -2743,9 +2753,9 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%r8) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) @@ -2758,32 +2768,31 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, 112(%r9) ; SSE-NEXT: movapd %xmm3, 96(%r9) ; SSE-NEXT: movapd %xmm4, 80(%r9) -; SSE-NEXT: movapd %xmm5, 64(%r9) -; SSE-NEXT: movapd %xmm6, 48(%r9) -; SSE-NEXT: movapd %xmm13, 32(%r9) -; SSE-NEXT: movapd %xmm9, 16(%r9) +; SSE-NEXT: movapd %xmm6, 64(%r9) +; SSE-NEXT: movapd %xmm8, 48(%r9) +; SSE-NEXT: movapd %xmm9, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm14, 112(%rax) -; SSE-NEXT: movapd %xmm10, 96(%rax) -; SSE-NEXT: movapd %xmm7, 80(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movapd %xmm13, 96(%rax) +; SSE-NEXT: movapd %xmm15, 80(%rax) +; SSE-NEXT: movapd %xmm10, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movapd %xmm15, 16(%rax) -; SSE-NEXT: movapd %xmm12, (%rax) -; SSE-NEXT: addq $1016, %rsp # imm = 0x3F8 +; SSE-NEXT: movapd %xmm12, 32(%rax) +; SSE-NEXT: movapd %xmm11, 16(%rax) +; SSE-NEXT: movapd %xmm7, (%rax) +; SSE-NEXT: addq $1032, %rsp # imm = 0x408 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1064, %rsp # imm = 0x428 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 @@ -2791,11 +2800,11 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd %ymm3, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 @@ -2803,10 +2812,10 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm8[0,0],ymm1[6,4],ymm8[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,2],ymm0[6,4],ymm8[6,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm2[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2814,15 +2823,15 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm5[0,0],ymm4[6,4],ymm5[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,2],ymm0[6,4],ymm5[6,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm6[0,0],ymm4[6,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,2],ymm0[6,4],ymm6[6,6] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm7[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm9[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2845,18 +2854,18 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[3],ymm7[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[3],ymm4[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2866,54 +2875,54 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm0, %ymm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm9[0,0],ymm1[6,4],ymm9[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,0],ymm9[2,2],ymm0[6,4],ymm9[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm9[2,2],ymm0[6,4],ymm9[6,6] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm7[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm4[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm1[0,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2],ymm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 736(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 736(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm12[0,1] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm12[0],ymm0[0],ymm12[3],ymm0[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm10[0,1] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm10[0],ymm0[0],ymm10[3],ymm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[3,0],ymm8[1,0],ymm15[7,4],ymm8[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm14[2,0],ymm8[2,3],ymm14[6,4],ymm8[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,0],xmm6[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm11[0,2],xmm6[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,0],xmm7[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm12[0,2],xmm7[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1],ymm11[1,3],ymm0[7,5],ymm11[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1],ymm12[1,3],ymm0[7,5],ymm12[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[3,0],ymm5[1,0],ymm14[7,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,0],ymm5[2,3],ymm6[6,4],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,0],xmm4[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[0,2],xmm4[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[3,0],ymm6[1,0],ymm8[7,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm13[1,0],xmm5[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm7[0,2],xmm5[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,1],ymm8[1,3],ymm0[7,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[3,1],ymm14[1,3],ymm0[7,5],ymm14[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,0],ymm3[1,0],ymm0[7,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,0],xmm2[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[0,2],xmm2[1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,0],ymm3[1,0],ymm0[7,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,0],ymm3[2,3],ymm5[6,4],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[1,0],xmm2[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[0,2],xmm2[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload @@ -2921,73 +2930,73 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,0],ymm9[1,0],ymm5[7,4],ymm9[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,0],ymm9[1,0],ymm11[7,4],ymm9[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,0],ymm9[2,3],ymm2[6,4],ymm9[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[1,0],xmm1[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,0],xmm1[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm1[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,1],ymm13[1,3],ymm12[7,5],ymm13[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,1],ymm13[1,3],ymm10[7,5],ymm13[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm15[2,1],mem[2,0],ymm15[6,5],mem[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0],ymm4[2,0],ymm0[4,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm3[2,0],ymm0[4,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm14[2,1],mem[2,0],ymm14[6,5],mem[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm8[2,1],mem[2,0],ymm8[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,0],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,0],ymm10[2,0],ymm7[4,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,0],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm14[0,1,2,3],mem[4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[2,1],ymm11[2,0],ymm5[6,5],ymm11[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,0],ymm2[2,0],ymm0[4,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm7[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm0[2,0],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2],ymm8[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,0],ymm8[2,0],ymm9[4,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm11[2,1],mem[2,0],ymm11[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm9[2,0],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2],ymm10[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[0,0],ymm10[2,0],ymm7[4,4],ymm10[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm13[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[2,1],ymm6[2,0],ymm1[6,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm5[2,1],ymm8[2,0],ymm5[6,5],ymm8[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,0,1] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm14 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,0],xmm14[2,3] @@ -2996,45 +3005,45 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm15[2,0],ymm0[4,4],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm4[3,1],ymm5[4,5],ymm4[7,5] -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,0],ymm15[2,0],ymm0[4,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm3[3,1],ymm1[4,5],ymm3[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm4[3,1],mem[3,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1],ymm13[2,1],ymm5[7,5],ymm13[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1],ymm10[3,1],ymm7[4,5],ymm10[7,5] -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[3,1],ymm7[2,1],ymm5[7,5],ymm7[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[3,1],ymm13[2,1],ymm11[7,5],ymm13[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm15[3,1],ymm0[4,5],ymm15[7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[3,1],xmm14[3,3] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm12 -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,1],ymm6[2,1],ymm1[7,5],ymm6[6,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm2[3,1],ymm1[4,5],ymm2[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm6[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm6[2,1],ymm4[7,5],ymm6[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm15[3,1],ymm0[4,5],ymm15[7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,1],xmm14[3,3] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm8[2,1],ymm5[7,5],ymm8[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,1],ymm8[3,1],ymm9[4,5],ymm8[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,1],ymm10[3,1],ymm7[4,5],ymm10[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm9[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm11[2,1],ymm2[7,5],ymm11[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm12[2,1],ymm2[7,5],ymm12[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] @@ -3042,7 +3051,6 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -3051,86 +3059,95 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm5[0,0],ymm2[6,4],ymm5[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2],ymm1[2,0],ymm5[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm7[0,0],ymm2[6,4],ymm7[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm1[2,0],ymm7[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm10[1],ymm13[0],ymm10[2],ymm13[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,0],ymm11[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm1[0,0],ymm6[6,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm2[2,0],ymm1[4,6],ymm2[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm13[0],ymm1[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,0],ymm10[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm0[0,0],ymm4[6,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm2[2,0],ymm0[4,6],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm15[0],ymm2[2],ymm15[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,0],ymm12[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm1[0,0],ymm6[6,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,2],ymm3[2,0],ymm1[4,6],ymm3[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm8[0],ymm2[2],ymm8[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,0],ymm2[0,0],ymm5[6,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,2],ymm3[2,0],ymm2[4,6],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0],xmm3[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vmovapd 656(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm9[1],ymm3[0],ymm9[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm7[0,1],ymm15[2,0],ymm7[4,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm13[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[2,0],ymm15[0,0],ymm13[6,4],ymm15[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm15[0,2],ymm12[2,0],ymm15[4,6],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm0[1,0],ymm4[7,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[2,0],ymm0[4,7],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm14[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm9[1],ymm12[0],ymm9[2],ymm12[2] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[3,1],mem[1,3],ymm4[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[1,1],ymm4[2,0],ymm10[5,5],ymm4[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm5[1,0],ymm0[7,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,3],ymm0[2,0],ymm5[4,7],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[0,1],ymm14[2,0],ymm4[4,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm3[2,0],ymm0[0,0],ymm3[6,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm1[1,0],ymm6[7,4],ymm1[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm6[2,0],ymm1[4,7],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps $12, (%rsp), %xmm15, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm15[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1],ymm10[2,0],ymm12[5,5],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm7[1,0],ymm1[7,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,3],ymm1[2,0],ymm7[4,7],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1],ymm10[2,0],ymm12[5,5],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,0],ymm2[1,0],ymm5[7,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,3],ymm1[2,0],ymm2[4,7],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm13[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm5[3,1],mem[1,3],ymm5[7,5],mem[5,7] @@ -3138,28 +3155,16 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1],ymm5[2,0],ymm10[5,5],ymm5[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,0],ymm1[1,0],ymm6[7,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[2,0],ymm1[4,7],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,1],mem[1,3],ymm2[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1],ymm2[2,0],ymm6[5,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,0],ymm15[1,0],ymm13[7,4],ymm15[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,3],ymm1[2,0],ymm15[4,7],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm0[1,0],ymm3[7,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[2,0],ymm0[4,7],ymm2[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm11[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[3,1],ymm3[1,3],ymm9[7,5],ymm3[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,1],ymm3[2,0],ymm7[5,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm9[3,1],mem[1,3],ymm9[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1],ymm3[2,0],ymm4[5,5],ymm3[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -3192,7 +3197,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -3200,11 +3205,11 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) -; AVX1-ONLY-NEXT: addq $1064, %rsp # imm = 0x428 +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3212,247 +3217,248 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $1160, %rsp # imm = 0x488 ; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm14 = <0,6,4,u> -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm2 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm12 = <0,6,4,u> +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm12, %ymm0 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[0,1],ymm6[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[0,1],ymm6[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm14, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm12, %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm0[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm14, %ymm15 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm12, %ymm10 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm10[0,1,2],ymm15[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm2, %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm4, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm14, %ymm14 -; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm12, %ymm15 +; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm1[0,1],ymm0[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm0, %ymm15 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,3,5,3,5,3,5,3] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm9, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm11, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm11, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm9, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $0, (%rsp), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,3,2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm11[2,3],ymm3[4,5],ymm11[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, (%rsp), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vmovaps %ymm8, %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm14 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm15[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovaps %ymm3, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3461,161 +3467,165 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm4 -; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovaps %ymm9, %ymm1 -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm9, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1],ymm10[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm2, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm1, %ymm11 -; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm2, %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm3, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3] -; AVX2-SLOW-NEXT: # xmm14 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7] -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2] +; AVX2-SLOW-NEXT: # xmm11 = mem[0,0] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1],ymm10[2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r8) +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm11, %ymm2 +; AVX2-SLOW-NEXT: vmovaps %ymm11, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm13, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r9) +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1],ymm1[2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] +; AVX2-SLOW-NEXT: # xmm4 = mem[0,0] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm13 = [0,3,1,7,0,3,1,7] +; AVX2-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm13, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps $8, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm13, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm4, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm13, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm13, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) ; AVX2-SLOW-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -3624,170 +3634,170 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1160, %rsp # imm = 0x488 ; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm14 = <0,6,4,u> -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm12 = <0,6,4,u> +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[0,1],ymm6[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[0,1],ymm6[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm0[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm15 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm10[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm4, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm15 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1] -; AVX2-FAST-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm12, %ymm15 +; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm1[0,1],ymm0[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm9, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = [2,0,6,4,2,0,6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [2,0,6,4,2,0,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermilps $224, (%rsp), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm3, %ymm5 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload @@ -3796,72 +3806,73 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm13 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm13[2,3,2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm11[2,3],mem[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm13, %ymm14 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm13[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm3[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm13 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm12 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3870,159 +3881,165 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX2-FAST-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm10[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm2, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3] -; AVX2-FAST-NEXT: # xmm14 = mem[0,0] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2] +; AVX2-FAST-NEXT: # xmm11 = mem[0,0] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1],ymm10[2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vmovaps %ymm11, %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm13, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm5, (%r9) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1],ymm1[2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] +; AVX2-FAST-NEXT: # xmm4 = mem[0,0] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm13 = [0,3,1,7,0,3,1,7] +; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm13, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm13, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm13, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm13, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FAST-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -4031,247 +4048,248 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $1160, %rsp # imm = 0x488 ; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm14 = <0,6,4,u> -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm12 = <0,6,4,u> +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm12, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[0,1],ymm6[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[0,1],ymm6[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm14, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm12, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm2, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm0[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm14, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm10[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm2, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm4, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm14, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm12, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm1[0,1],ymm0[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm9, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, (%rsp), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm11[2,3],ymm3[4,5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, (%rsp), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm15[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4280,161 +4298,165 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2] +; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1],ymm10[2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1],ymm10[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm2, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm1, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm2, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3] -; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm3, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm3, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm3, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r9) +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1],ymm1[2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm13, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm13 = [0,3,1,7,0,3,1,7] +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm13, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm13, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm13, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -4796,11 +4818,12 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1248(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1248(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1264(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 1296(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4822,72 +4845,72 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 384(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 400(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 432(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 448(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 768(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 768(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 784(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 816(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 832(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1152(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1168(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1200(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1200(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1216(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm7 -; SSE-NEXT: movdqa 304(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa 304(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 336(%rdi), %xmm10 +; SSE-NEXT: movdqa 336(%rdi), %xmm7 ; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,1,1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 672(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 688(%rdi), %xmm0 @@ -4899,10 +4922,10 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 736(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1056(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1072(%rdi), %xmm0 @@ -4910,507 +4933,462 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1104(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1104(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1120(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1440(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1440(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1456(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 1488(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1504(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm3 -; SSE-NEXT: movdqa 208(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 240(%rdi), %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm5 +; SSE-NEXT: movdqa 208(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movdqa 240(%rdi), %xmm2 +; SSE-NEXT: movdqa 256(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,0,1,1] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm6 -; SSE-NEXT: movdqa 592(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm10 +; SSE-NEXT: movdqa 592(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: movdqa 624(%rdi), %xmm11 -; SSE-NEXT: movdqa 640(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[0,0,1,1] +; SSE-NEXT: movdqa 640(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,1,1] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm8[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 960(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 960(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 976(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] -; SSE-NEXT: movdqa 1008(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa 1008(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1024(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm14[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1344(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1360(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; SSE-NEXT: movdqa 1392(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1408(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm15[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, (%rsp), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm14[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm14[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm13[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm14[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1344(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1360(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm14[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa 1392(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1408(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, (%rsp), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm14[0],xmm7[1] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm14[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm14[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm14[0],xmm11[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1] ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm14[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa 80(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa 80(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movdqa 176(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, (%rsp), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa 128(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 176(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd $238, (%rsp), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movdqa 272(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa 368(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa 320(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa 464(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa 416(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 272(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa 224(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa 560(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa 512(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 368(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa 320(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa 464(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 416(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa 560(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa 512(%rdi), %xmm9 +; SSE-NEXT: movdqa 656(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa 608(%rdi), %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 656(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 608(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movdqa 752(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa 704(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 752(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 704(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movdqa 848(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa 800(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa 848(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa 800(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa 944(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 896(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movdqa 944(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa 896(%rdi), %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] ; SSE-NEXT: movdqa 1040(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa 992(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa 992(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] ; SSE-NEXT: movdqa 1136(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 1088(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa 1088(%rdi), %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa 1232(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa 1184(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa 1232(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa 1184(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa 1328(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] ; SSE-NEXT: movdqa 1280(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa 1424(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] ; SSE-NEXT: movdqa 1376(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa 1520(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] ; SSE-NEXT: movdqa 1472(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, (%rsp), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[3,3,3,3] @@ -5419,43 +5397,34 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: pshufd $255, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -5472,29 +5441,39 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -5510,29 +5489,59 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -5540,11 +5549,23 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -5554,17 +5575,19 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -5574,7 +5597,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -5584,20 +5608,20 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,0,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5605,17 +5629,17 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -5625,71 +5649,68 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] @@ -5705,7 +5726,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm15, %xmm4 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5757,9 +5778,10 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5800,9 +5822,10 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5810,21 +5833,20 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5832,14 +5854,13 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -5848,18 +5869,20 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] @@ -5994,11 +6017,11 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, 240(%r9) ; SSE-NEXT: movapd %xmm3, 224(%r9) ; SSE-NEXT: movapd %xmm5, 208(%r9) -; SSE-NEXT: movapd %xmm6, 192(%r9) -; SSE-NEXT: movapd %xmm7, 176(%r9) +; SSE-NEXT: movapd %xmm7, 192(%r9) +; SSE-NEXT: movapd %xmm13, 176(%r9) ; SSE-NEXT: movapd %xmm8, 160(%r9) ; SSE-NEXT: movapd %xmm9, 144(%r9) -; SSE-NEXT: movapd %xmm12, 128(%r9) +; SSE-NEXT: movapd %xmm11, 128(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -6017,7 +6040,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm14, 240(%rax) -; SSE-NEXT: movapd %xmm13, 224(%rax) +; SSE-NEXT: movapd %xmm12, 224(%rax) ; SSE-NEXT: movapd %xmm15, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rax) @@ -6043,14 +6066,13 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movapd %xmm4, (%rax) ; SSE-NEXT: addq $2184, %rsp # imm = 0x888 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride6_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2536, %rsp # imm = 0x9E8 +; AVX1-ONLY-NEXT: subq $2584, %rsp # imm = 0xA18 ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 @@ -6075,10 +6097,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[2,2],ymm0[6,4],ymm9[6,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm4[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6094,9 +6115,10 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm14[0,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 736(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6112,18 +6134,19 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm13[0,0],ymm1[6,4],ymm13[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm13[2,2],ymm0[6,4],ymm13[6,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm2[0,0],ymm1[6,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,2],ymm0[6,4],ymm2[6,6] ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm13[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6139,18 +6162,18 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1440(%rdi), %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm10[0,0],ymm1[6,4],ymm10[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm10[2,2],ymm0[6,4],ymm10[6,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1440(%rdi), %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm11[0,0],ymm1[6,4],ymm11[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm11[2,2],ymm0[6,4],ymm11[6,6] ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6202,9 +6225,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6220,273 +6243,273 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm4[0,0],ymm1[6,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,2],ymm0[6,4],ymm4[6,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm3[0,0],ymm1[6,4],ymm3[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,2],ymm0[6,4],ymm3[6,6] ; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] -; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm2[0,0],ymm1[6,4],ymm2[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,2],ymm0[6,4],ymm2[6,6] -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm12[0,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[2,0],ymm1[0,0],ymm5[6,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm0[2,0],ymm1[2,2],ymm0[6,4],ymm1[6,6] +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,2],xmm5[0,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm15[0,1] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm12[0,1] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[1,0],ymm0[7,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,0],ymm1[2,3],ymm0[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,0],xmm0[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm0[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[3,0],ymm0[1,0],ymm12[7,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm0[2,3],ymm14[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,0],xmm9[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm9[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm0[3,1],ymm12[1,3],ymm0[7,5],ymm12[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm9[3,0],ymm0[1,0],ymm9[7,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm0[2,3],ymm14[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,0],xmm1[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm1[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[1,0],ymm0[7,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,3],ymm0[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,0],xmm14[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm15[0,2],xmm14[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,0],ymm1[1,0],ymm0[7,4],ymm1[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm1[2,3],ymm14[6,4],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,0],xmm13[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[0,2],xmm13[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[3,1],ymm15[1,3],ymm1[7,5],ymm15[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,1],ymm15[1,3],ymm0[7,5],ymm15[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm13[1,0],ymm14[7,4],ymm13[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm13[2,3],ymm0[6,4],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm1[1,0],xmm11[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm13[0,2],xmm11[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm14[3,0],ymm11[1,0],ymm14[7,4],ymm11[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm13[2,0],ymm11[2,3],ymm13[6,4],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[1,0],xmm10[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm13[0,2],xmm10[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm1[3,1],ymm13[1,3],ymm1[7,5],ymm13[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[3,1],ymm13[1,3],ymm0[7,5],ymm13[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,0],ymm10[1,0],ymm11[7,4],ymm10[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm10[2,3],ymm0[6,4],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[1,0],xmm9[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm10[0,2],xmm9[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[3,1],ymm10[1,3],ymm1[7,5],ymm10[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[3,0],ymm8[1,0],ymm11[7,4],ymm8[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm8[2,3],ymm10[6,4],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[1,0],ymm0[7,4],ymm8[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,3],ymm0[6,4],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,0],xmm7[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[0,2],xmm7[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm0[1,0],xmm7[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[0,2],xmm7[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm6[1,0],ymm0[7,4],ymm6[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,3],ymm0[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[1,0],xmm5[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,2],xmm5[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[3,0],ymm6[1,0],ymm8[7,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[1,0],xmm4[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm7[0,2],xmm4[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,1],ymm7[1,3],ymm1[7,5],ymm7[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm4[1,0],ymm0[7,4],ymm4[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,3],ymm0[6,4],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,0],xmm3[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[0,2],xmm3[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[3,1],ymm7[1,3],ymm0[7,5],ymm7[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[3,0],ymm3[1,0],ymm6[7,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[1,0],xmm2[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[0,2],xmm2[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,1],ymm4[1,3],ymm1[7,5],ymm4[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,1],ymm4[1,3],ymm0[7,5],ymm4[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm1[1,0],ymm3[7,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,3],ymm0[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,0],xmm12[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm12[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm0[1,0],ymm3[7,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm5[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm5[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm2[3,1],mem[1,3],ymm2[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm12[2,0],ymm1[4,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm9[2,1],mem[2,0],ymm9[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,0],ymm2[4,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,0],ymm2[4,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm14[2,1],mem[2,0],ymm14[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm8[2,0],ymm1[4,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm10[2,0],ymm1[4,4],ymm10[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm11[2,1],mem[2,0],ymm11[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm10[0,1,2,3],mem[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm6[2,0],ymm1[4,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,0],ymm2[4,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm8[2,1],mem[2,0],ymm8[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] @@ -6496,114 +6519,114 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm6[2,1],mem[2,0],ymm6[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm13[0,0],ymm2[2,0],ymm13[4,4],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0],ymm6[2,0],ymm13[4,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm3[2,1],ymm5[2,0],ymm3[6,5],ymm5[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm0[2,0],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,1],ymm3[2,0],ymm4[6,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm5[2,0],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0],ymm14[2,0],ymm1[4,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[0,1],mem[3,1],ymm10[4,5],mem[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm11[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,1],mem[2,1],ymm11[7,5],mem[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[0,1],mem[3,1],ymm10[4,5],mem[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm11[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm15[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,0],ymm15[2,0],ymm2[4,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm12[3,1],ymm0[4,5],ymm12[7,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm1[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[3,1],ymm14[2,1],ymm1[7,5],ymm14[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm1[3,1],mem[3,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[2,1],ymm12[7,5],mem[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[0,1],ymm8[3,1],ymm10[4,5],ymm8[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm10[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm0[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm0[3,1],mem[2,1],ymm0[7,5],mem[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm10[3,1],ymm0[4,5],ymm10[7,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm1[3,1],mem[3,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,1],mem[2,1],ymm11[7,5],mem[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1],ymm6[3,1],ymm8[4,5],ymm6[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,1],mem[2,1],ymm10[7,5],mem[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm14[3,1],ymm1[4,5],ymm14[7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1],xmm3[3,3] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm5[2,1],ymm4[7,5],ymm5[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,1],ymm2[3,1],ymm13[4,5],ymm2[7,5] -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm7[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm15[3,1],ymm2[4,5],ymm15[7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[3,1],xmm7[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm3[2,1],ymm4[7,5],ymm3[6,5] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,1],ymm6[3,1],ymm13[4,5],ymm6[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm8[3,1],mem[3,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm7[2,1],ymm3[7,5],ymm7[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm6[2,1],ymm3[7,5],ymm6[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm9[3,1],ymm0[4,5],ymm9[7,5] -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm15[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[3,1],mem[3,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm5[2,1],ymm4[7,5],ymm5[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm8[2,1],ymm4[7,5],ymm8[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] @@ -6614,9 +6637,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[3,1],ymm8[2,1],ymm6[7,5],ymm8[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm7[2,1],ymm5[7,5],ymm7[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] @@ -6632,18 +6655,19 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm8[0],ymm1[2],ymm8[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,0],ymm6[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm8[0,0],ymm2[6,4],ymm8[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm1[2,0],ymm8[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm10[0,0],ymm2[6,4],ymm10[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm1[2,0],ymm10[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -6652,16 +6676,13 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm14[0],ymm1[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm6[0,0],ymm1[6,4],ymm6[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm1[2,0],ymm6[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm7[0,0],ymm2[6,4],ymm7[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm1[2,0],ymm7[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6676,7 +6697,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm5[0],ymm1[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm8[0],ymm1[2],ymm8[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] @@ -6691,7 +6712,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 656(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6716,7 +6737,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 848(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[2,3,0,1] @@ -6725,8 +6746,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -6740,14 +6761,14 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[2,0],ymm2[0,0],ymm10[6,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm14[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,0],ymm2[0,0],ymm14[6,4],ymm2[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,0],ymm2[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -6756,223 +6777,223 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 1232(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm14[0],ymm1[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,0],ymm15[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[2,0],ymm1[0,0],ymm9[6,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,2],ymm7[2,0],ymm1[4,6],ymm7[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,0],ymm1[0,0],ymm8[6,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[0,2],ymm6[2,0],ymm1[4,6],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3] -; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm14[1],mem[0],ymm14[2],mem[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,1],ymm15[2,0],ymm13[4,5],ymm15[6,4] +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm9[1],mem[0],ymm9[2],mem[2] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,1],ymm15[2,0],ymm9[4,5],ymm15[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[2,0],ymm0[0,0],ymm7[6,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm6[2,0],ymm0[0,0],ymm6[6,4],ymm0[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[0,2],ymm13[2,0],ymm0[4,6],ymm13[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,0],ymm8[1,0],ymm13[7,4],ymm8[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,3],ymm13[2,0],ymm8[4,7],ymm13[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm15[3,1],mem[1,3],ymm15[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,1],ymm15[2,0],ymm14[5,5],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm9[3,0],ymm10[1,0],ymm9[7,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,3],ymm13[2,0],ymm10[4,7],ymm13[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm6[1,0],ymm8[7,4],ymm6[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,3],ymm8[2,0],ymm6[4,7],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm13[3,1],mem[1,3],ymm13[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm15[1,1],ymm13[2,0],ymm15[5,5],ymm13[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm5[1,0],ymm8[7,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm8[2,0],ymm5[4,7],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm13[3,1],mem[1,3],ymm13[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm15[1,1],ymm13[2,0],ymm15[5,5],ymm13[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm12[3,0],ymm4[1,0],ymm12[7,4],ymm4[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,3],ymm8[2,0],ymm4[4,7],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[1,3],ymm12[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,1],ymm12[2,0],ymm13[5,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[3,0],ymm3[1,0],ymm11[7,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm8[2,0],ymm3[4,7],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,1],mem[1,3],ymm11[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[1,1],ymm11[2,0],ymm12[5,5],ymm11[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[3,0],ymm2[1,0],ymm10[7,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,3],ymm8[2,0],ymm2[4,7],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[1,1],ymm10[2,0],ymm11[5,5],ymm10[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[3,0],ymm1[1,0],ymm9[7,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm8[2,0],ymm1[4,7],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm9[3,1],mem[1,3],ymm9[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm10[3,0],ymm7[1,0],ymm10[7,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,3],ymm13[2,0],ymm7[4,7],ymm13[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm10[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[1,1],ymm9[2,0],ymm10[5,5],ymm9[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm0[1,0],ymm7[7,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm7[2,0],ymm0[4,7],ymm7[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,1],ymm8[2,0],ymm9[5,5],ymm8[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r9) +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm10[3,0],ymm5[1,0],ymm10[7,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm13[2,0],ymm5[4,7],ymm13[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0],ymm4[1,0],ymm12[7,4],ymm4[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,3],ymm12[2,0],ymm4[4,7],ymm12[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm10[1,1],ymm13[2,0],ymm10[5,5],ymm13[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm3[1,0],ymm11[7,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm11[2,0],ymm3[4,7],ymm11[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm10[1,1],ymm12[2,0],ymm10[5,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm14[3,0],ymm2[1,0],ymm14[7,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,3],ymm10[2,0],ymm2[4,7],ymm10[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,1],mem[1,3],ymm11[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[1,1],ymm11[2,0],ymm12[5,5],ymm11[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm1[1,0],ymm8[7,4],ymm1[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm8[2,0],ymm1[4,7],ymm8[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[1,1],ymm10[2,0],ymm11[5,5],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm0[1,0],ymm6[7,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm6[2,0],ymm0[4,7],ymm6[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[1,1],ymm8[2,0],ymm10[5,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) @@ -6980,9 +7001,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, (%rax) -; AVX1-ONLY-NEXT: addq $2536, %rsp # imm = 0x9E8 +; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) +; AVX1-ONLY-NEXT: addq $2584, %rsp # imm = 0xA18 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -6992,44 +7013,44 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm4 = <0,6,4,u> -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm8[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm8 = <0,6,4,u> +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm7[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm11 = [4,2,4,2,4,2,4,2] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm5 = [4,2,4,2,4,2,4,2] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[0,1],ymm2[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7037,7 +7058,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1056(%rdi), %ymm1 @@ -7053,7 +7074,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1088(%rdi), %ymm1 @@ -7062,7 +7083,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %ymm1 @@ -7078,7 +7099,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1472(%rdi), %ymm1 @@ -7087,7 +7108,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm1 @@ -7095,21 +7116,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm1 @@ -7117,21 +7138,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 864(%rdi), %ymm1 @@ -7144,16 +7165,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1184(%rdi), %ymm0 @@ -7161,73 +7182,73 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovaps 1152(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovaps 1248(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vmovaps 1248(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1312(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovaps 1312(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm11 = <1,7,5,u> -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm11, %ymm0 +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <1,7,5,u> +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5,3,5,3,5,3,5,3] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5,3,5,3,5,3,5,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm11, %ymm12 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm5, %ymm12 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm15[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm11, %ymm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm5, %ymm9 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm11, %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] @@ -7279,8 +7300,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5],mem[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] @@ -7298,17 +7319,17 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm13[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] @@ -7370,49 +7391,49 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1152(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm12[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm12[2,3,2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4],ymm8[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm6[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps $244, (%rsp), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm3[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vpermilps $244, (%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = mem[3,3,3,3] @@ -7428,9 +7449,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7457,27 +7478,27 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] @@ -7485,20 +7506,20 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -7508,17 +7529,17 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -7544,9 +7565,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm0[4,5,6,7] @@ -7557,138 +7578,140 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] ; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,2,0,6,0,2,0,6] +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm13[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 848(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1040(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm13 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1232(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1232(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1424(%rdi), %xmm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm15[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm3[2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload @@ -7697,22 +7720,13 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3] ; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, (%rsp), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps (%rsp), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -7720,8 +7734,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -7729,175 +7743,184 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r9) +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm13, 160(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $2472, %rsp # imm = 0x9A8 +; AVX2-FAST-NEXT: subq $2504, %rsp # imm = 0x9C8 ; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm4 = <0,6,4,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm8 = <0,6,4,u> ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm8[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm7[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm5 = [4,2,4,2,4,2,4,2] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[0,1],ymm2[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7905,7 +7928,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1056(%rdi), %ymm1 @@ -7921,7 +7944,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1088(%rdi), %ymm1 @@ -7930,7 +7953,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %ymm1 @@ -7941,12 +7964,12 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1376(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1472(%rdi), %ymm1 @@ -7955,7 +7978,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm1 @@ -7963,21 +7986,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm1 @@ -7985,21 +8008,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 864(%rdi), %ymm1 @@ -8012,16 +8035,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1184(%rdi), %ymm0 @@ -8029,205 +8052,206 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovaps 1248(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovaps 1248(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1280(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1312(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovaps 1312(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm10 = <1,7,5,u> -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm10, %ymm0 -; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <1,7,5,u> +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm10, %ymm12 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm5, %ymm12 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm14[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm4 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,3,2,3] +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [2,0,6,4,2,0,6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = [2,0,6,4,2,0,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %xmm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2,3],ymm1[4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm15[2,3],ymm11[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, (%rsp), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1],ymm14[2,3],ymm4[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2,3],ymm1[4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1,2,3],ymm1[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1,2,3],ymm0[4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm6, %ymm7 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm9[1,2,3],ymm7[4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm6, %ymm9 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload @@ -8236,151 +8260,150 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: # ymm10 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm6, %ymm7 +; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %xmm8 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm8[2,3,2,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %xmm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm6[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm13[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm7[1,2,3],ymm2[4],ymm7[5,6,7] ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm5[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vpermilps $255, (%rsp), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm11[1],ymm3[2,3,4],ymm11[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3],ymm7[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = mem[3,3,3,3] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovaps %ymm8, %ymm13 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = mem[3,3,3,3] +; AVX2-FAST-NEXT: vpermilps $244, (%rsp), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -8392,8 +8415,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -8406,86 +8429,87 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] -; AVX2-FAST-NEXT: # xmm5 = mem[0,0] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm2 = [4,2,4,2] +; AVX2-FAST-NEXT: # xmm2 = mem[0,0] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, (%rsp), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 848(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7] @@ -8494,64 +8518,64 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1232(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1232(%rdi), %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm13[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1424(%rdi), %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1424(%rdi), %xmm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm15[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm3[2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload @@ -8560,161 +8584,160 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3] ; AVX2-FAST-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, (%rsp), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm15[0,1,2],mem[3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm1, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 160(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 160(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm7, 224(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm10, 192(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm14, 160(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) -; AVX2-FAST-NEXT: addq $2472, %rsp # imm = 0x9A8 +; AVX2-FAST-NEXT: vmovaps %ymm6, 224(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm9, 192(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm13, 160(%rax) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -8724,44 +8747,44 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = <0,6,4,u> -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm8[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm8 = <0,6,4,u> +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm7[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm11 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm5 = [4,2,4,2,4,2,4,2] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[0,1],ymm2[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8769,7 +8792,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1056(%rdi), %ymm1 @@ -8785,7 +8808,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1088(%rdi), %ymm1 @@ -8794,7 +8817,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %ymm1 @@ -8810,7 +8833,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1472(%rdi), %ymm1 @@ -8819,7 +8842,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm1 @@ -8827,21 +8850,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm1 @@ -8849,21 +8872,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 864(%rdi), %ymm1 @@ -8876,16 +8899,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1184(%rdi), %ymm0 @@ -8893,73 +8916,73 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps 1152(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1248(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1248(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1312(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1312(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm11 = <1,7,5,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm11, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <1,7,5,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5,3,5,3,5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm11, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm5, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm15[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm11, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm5, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] @@ -9011,8 +9034,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] @@ -9030,17 +9053,17 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm13[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] @@ -9102,49 +9125,49 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1152(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm12[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm12[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4],ymm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm6[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, (%rsp), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm3[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, (%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,3,3,3] @@ -9160,9 +9183,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9189,27 +9212,27 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] @@ -9217,20 +9240,20 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -9240,17 +9263,17 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -9276,9 +9299,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3],ymm0[4,5,6,7] @@ -9289,138 +9312,140 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 848(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1040(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1232(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1232(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1424(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm15[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm3[2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm5, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload @@ -9429,22 +9454,22 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3] ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps (%rsp), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, (%rsp), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -9452,8 +9477,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -9461,8 +9486,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -9471,117 +9496,117 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r9) +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -9589,350 +9614,349 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-LABEL: load_i32_stride6_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm21 ; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm20 ; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm13, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm13, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm8, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm8, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm8, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm13, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm18, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm31, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm6, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm9, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm18, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm31, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm6, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm9, %zmm2 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm18, %zmm26 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm31, %zmm27 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512F-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm1, %zmm20 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <0,6,12,18,24,30,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm10, %zmm19 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <1,7,13,19,25,31,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm3, %zmm21 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <0,6,12,18,24,30,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <1,7,13,19,25,31,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <2,8,14,20,26,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <3,9,15,21,27,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <20,26,0,6,12,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm22 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <21,27,1,7,13,u,u,u> -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm11 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm10, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm3, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm8 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm10, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <3,9,15,21,27,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <20,26,0,6,12,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm24 = <21,27,1,7,13,u,u,u> +; AVX512F-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm3, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm4, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm12, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm3, %zmm8 ; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm7 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm12, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm4, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm12, %zmm13 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm15, %zmm25 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm30 = <21,27,1,7,13,u,u,u> -; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <21,27,1,7,13,u,u,u> -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm14, %zmm18 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512F-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} ; AVX512F-NEXT: movw $31, %ax ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm28 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm29 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm17, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm15, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm16, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm28, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm16, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm28, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm25, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm12, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm29, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -9940,350 +9964,349 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i32_stride6_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm21 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm18, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm31, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm6, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm9, %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm26 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm31, %zmm27 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm1, %zmm20 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <0,6,12,18,24,30,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm10, %zmm19 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <1,7,13,19,25,31,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm3, %zmm21 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,6,12,18,24,30,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <1,7,13,19,25,31,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <2,8,14,20,26,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <3,9,15,21,27,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <20,26,0,6,12,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm22 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <21,27,1,7,13,u,u,u> -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm10, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm3, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <3,9,15,21,27,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <20,26,0,6,12,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = <21,27,1,7,13,u,u,u> +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm3, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm4, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm3, %zmm8 ; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm7 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm12, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm4, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm12, %zmm13 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm25 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm30 = <21,27,1,7,13,u,u,u> -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <21,27,1,7,13,u,u,u> -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm14, %zmm18 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512BW-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} ; AVX512BW-NEXT: movw $31, %ax ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm28 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm29 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index 97f499968e8e0..ab44439f98ff3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -281,56 +281,56 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 96(%rdi), %xmm1 ; SSE-NEXT: movdqa 64(%rdi), %xmm0 ; SSE-NEXT: movdqa 80(%rdi), %xmm2 -; SSE-NEXT: movdqa (%rdi), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm5 ; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm11[0],xmm7[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,1,1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm11[0],xmm5[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,1,1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1] ; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm10[0],xmm8[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm6[0],xmm2[1] -; SSE-NEXT: movapd %xmm7, (%rsi) -; SSE-NEXT: movapd %xmm5, (%rdx) -; SSE-NEXT: movapd %xmm9, (%rcx) +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] +; SSE-NEXT: movapd %xmm5, (%rsi) +; SSE-NEXT: movapd %xmm6, (%rdx) +; SSE-NEXT: movapd %xmm11, (%rcx) ; SSE-NEXT: movapd %xmm8, (%r8) ; SSE-NEXT: movapd %xmm4, (%r9) ; SSE-NEXT: movapd %xmm0, (%rdi) @@ -617,140 +617,146 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i32_stride7_vf8: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: movdqa 144(%rdi), %xmm11 -; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm10 -; SSE-NEXT: movdqa 192(%rdi), %xmm13 -; SSE-NEXT: movdqa 160(%rdi), %xmm9 -; SSE-NEXT: movdqa 112(%rdi), %xmm8 +; SSE-NEXT: subq $24, %rsp +; SSE-NEXT: movdqa 144(%rdi), %xmm9 +; SSE-NEXT: movdqa 80(%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa 16(%rdi), %xmm7 +; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: movdqa 192(%rdi), %xmm8 +; SSE-NEXT: movdqa 160(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm15 ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movdqa 32(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm4[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm2[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm13[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] -; SSE-NEXT: movdqa 208(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] -; SSE-NEXT: movdqa 96(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm14[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 176(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: movdqa 208(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm12[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm4[0],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm4[0],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm12[0],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm12[0],xmm3[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movapd %xmm10, (%rdx) -; SSE-NEXT: movapd %xmm9, 16(%rdx) -; SSE-NEXT: movapd %xmm15, (%rcx) -; SSE-NEXT: movapd %xmm12, 16(%rcx) -; SSE-NEXT: movapd %xmm4, (%r8) -; SSE-NEXT: movapd %xmm6, 16(%r8) -; SSE-NEXT: movapd %xmm14, (%r9) -; SSE-NEXT: movapd %xmm11, 16(%r9) +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movapd %xmm8, (%rcx) +; SSE-NEXT: movapd %xmm13, 16(%rcx) +; SSE-NEXT: movapd %xmm15, (%r8) +; SSE-NEXT: movapd %xmm7, 16(%r8) +; SSE-NEXT: movapd %xmm2, (%r9) +; SSE-NEXT: movapd %xmm9, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm13, (%rax) -; SSE-NEXT: movapd %xmm7, 16(%rax) +; SSE-NEXT: movapd %xmm14, (%rax) +; SSE-NEXT: movapd %xmm10, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm5, (%rax) -; SSE-NEXT: movapd %xmm3, 16(%rax) -; SSE-NEXT: popq %rax +; SSE-NEXT: movapd %xmm3, (%rax) +; SSE-NEXT: movapd %xmm4, 16(%rax) +; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride7_vf8: @@ -1245,297 +1251,306 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i32_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $424, %rsp # imm = 0x1A8 -; SSE-NEXT: movdqa 304(%rdi), %xmm9 -; SSE-NEXT: movdqa 272(%rdi), %xmm2 -; SSE-NEXT: movdqa 224(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 192(%rdi), %xmm7 -; SSE-NEXT: movdqa 160(%rdi), %xmm8 -; SSE-NEXT: movdqa 112(%rdi), %xmm11 +; SSE-NEXT: subq $440, %rsp # imm = 0x1B8 +; SSE-NEXT: movdqa 304(%rdi), %xmm3 +; SSE-NEXT: movdqa 272(%rdi), %xmm5 +; SSE-NEXT: movdqa 224(%rdi), %xmm15 +; SSE-NEXT: movdqa 240(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm9 +; SSE-NEXT: movdqa 192(%rdi), %xmm14 +; SSE-NEXT: movdqa 160(%rdi), %xmm11 +; SSE-NEXT: movdqa 112(%rdi), %xmm4 ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm14 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm7 +; SSE-NEXT: movdqa 336(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 416(%rdi), %xmm2 -; SSE-NEXT: movdqa 384(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa 416(%rdi), %xmm8 +; SSE-NEXT: movdqa 384(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa 144(%rdi), %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa 144(%rdi), %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa 256(%rdi), %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm2, %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa 256(%rdi), %xmm14 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa 368(%rdi), %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa 368(%rdi), %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movdqa 400(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm13[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,1,1] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa 176(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa 288(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movdqa 400(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa 208(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,0,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm12[2],xmm4[3],xmm12[3] +; SSE-NEXT: movdqa %xmm12, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm15[0],xmm9[1] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm14[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm15[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm14[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm5[0],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm14[2],xmm6[3],xmm14[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm14[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm15[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm15[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm10[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm15[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm15[0],xmm14[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 16(%rsi) +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm5[0],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1552,24 +1567,24 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movapd %xmm7, 48(%r9) -; SSE-NEXT: movapd %xmm5, 32(%r9) -; SSE-NEXT: movapd %xmm9, (%r9) +; SSE-NEXT: movapd %xmm9, 48(%r9) +; SSE-NEXT: movapd %xmm14, 32(%r9) +; SSE-NEXT: movapd %xmm12, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm6, 48(%rax) -; SSE-NEXT: movapd %xmm1, 32(%rax) +; SSE-NEXT: movapd %xmm13, 48(%rax) +; SSE-NEXT: movapd %xmm15, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm14, 48(%rax) -; SSE-NEXT: movapd %xmm12, 32(%rax) -; SSE-NEXT: movapd %xmm11, (%rax) -; SSE-NEXT: movapd %xmm8, 16(%rax) -; SSE-NEXT: addq $424, %rsp # imm = 0x1A8 +; SSE-NEXT: movapd %xmm4, 48(%rax) +; SSE-NEXT: movapd %xmm6, 32(%rax) +; SSE-NEXT: movapd %xmm7, (%rax) +; SSE-NEXT: movapd %xmm10, 16(%rax) +; SSE-NEXT: addq $440, %rsp # imm = 0x1B8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride7_vf16: @@ -1579,7 +1594,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 @@ -1590,8 +1605,8 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 @@ -1606,11 +1621,11 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 @@ -1618,173 +1633,173 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm5[2,2],ymm7[5,5],ymm5[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0],xmm15[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0],xmm13[1],xmm11[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm8[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm2[3,3],ymm1[4,4],ymm2[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm12[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm6[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1],ymm4[2,2],ymm13[5,5],ymm4[6,6] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm3[2,2],ymm15[5,5],ymm3[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0],xmm9[1],xmm11[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0],xmm10[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm3[0,1] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm4[0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm14[3,3],ymm0[4,4],ymm14[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm6[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm8[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm5[0,3],ymm14[7,5],ymm5[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm7[2,1],ymm14[2,0],ymm7[6,5],ymm14[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm4[0,3],ymm12[7,5],ymm4[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[2,1],ymm12[2,0],ymm13[6,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm3[0,3],ymm12[7,5],ymm3[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm15[2,1],ymm12[2,0],ymm15[6,5],ymm12[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm7[0,0],ymm5[5,4],ymm7[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,1],ymm2[0,2],ymm7[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm13[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1],ymm8[1,3],ymm7[4,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[0,2],ymm5[2,0],ymm1[4,6],ymm5[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,0],ymm13[0,0],ymm4[5,4],ymm13[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,1],ymm2[0,2],ymm13[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,1],ymm6[1,3],ymm2[4,5],ymm6[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,2],ymm7[2,0],ymm1[4,6],ymm7[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0],ymm15[0,0],ymm3[5,4],ymm15[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm15[3,1],ymm3[0,2],ymm15[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm3[1,3],ymm5[4,5],ymm3[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,2],ymm4[2,0],ymm0[4,6],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm2[0,0],ymm1[7,4],ymm2[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,0],ymm8[2,0],ymm7[5,4],ymm8[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,0],ymm1[6,4],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm3[2,0],ymm5[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm14[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,1],ymm4[1,3],ymm5[4,5],ymm4[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[0,2],ymm8[2,0],ymm0[4,6],ymm8[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,0],ymm6[2,0],ymm2[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm3[0,1,2],xmm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = mem[0],xmm11[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[0,0],ymm0[7,4],ymm8[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm5[1,0],ymm4[2,0],ymm5[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,0],ymm0[6,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,1],ymm8[3,3],ymm7[6,5],ymm8[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm2[2,1],ymm6[3,3],ymm2[6,5],ymm6[7,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,0],ymm6[2,0],ymm8[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,0],ymm6[2,0],ymm9[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm10[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[0,0],ymm12[1,0],ymm13[4,4],ymm12[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[0,1],xmm2[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,1],ymm3[3,3],ymm5[6,5],ymm3[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm14[0,0],ymm13[1,0],ymm14[4,4],ymm13[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[0,1],xmm3[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,1],ymm4[3,3],ymm5[6,5],ymm4[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm3[1],xmm7[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,0],ymm3[2,0],ymm6[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm12[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[0,0],ymm11[1,0],ymm10[4,4],ymm11[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm8[0,1],xmm6[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[3,0],ymm6[0,0],ymm7[7,4],ymm6[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm1[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[1,0],ymm12[2,0],ymm13[5,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,0],ymm12[1,0],ymm11[4,4],ymm12[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm9[0,1],xmm6[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm6[0,0],ymm2[7,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm1[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,0],ymm6[4,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0],ymm6[0,0],ymm5[7,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,0],ymm6[4,5],ymm5[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,0],ymm12[2,0],ymm11[5,4],ymm12[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0],ymm7[0,0],ymm5[7,4],ymm7[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,0],ymm7[4,5],ymm5[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1802,11 +1817,11 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -2042,37 +2057,35 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: subq $296, %rsp # imm = 0x128 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,7,6,u> -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm5[6],ymm12[7] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm5[1] +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm12 +; AVX2-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm12[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpbroadcastd 196(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpbroadcastd 196(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm13 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm7 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm0 @@ -2088,67 +2101,67 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm7 = ymm9[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm11[2,3],ymm3[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm11[2,3],ymm1[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,6,5,6,5,6,5,6] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,0,7,7,5,4,7,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm14 ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm15 ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm8 = ymm14[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,0] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm8[2,3],ymm3[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm4[1],ymm13[2,3,4],ymm4[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm12[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm12[1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm4 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm11[1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm11[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7] -; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm8[1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -2161,105 +2174,106 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [4,3,4,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm12 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpbroadcastd 324(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 436(%rdi), %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,0,3,3,1,0,7,7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1,2],xmm13[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpbroadcastd 440(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 136(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd 440(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 136(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4],ymm12[5],ymm4[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 360(%rdi), %xmm3 ; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 360(%rdi), %xmm4 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] ; AVX2-FAST-NEXT: vpermd 416(%rdi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm11, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 32(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-NEXT: addq $296, %rsp # imm = 0x128 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -2494,115 +2508,115 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm7, %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <0,7,14,21,28,u,u,u> -; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,21,28,u,u,u> +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512F-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm2, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm8, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 ; AVX512F-NEXT: movb $-32, %dil ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm6, %zmm7, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm7, %zmm6, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <1,8,15,22,29,u,u,u> -; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm14 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512F-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm14 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm2, %zmm4, %zmm7 +; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm7, %zmm15 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 ; AVX512F-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm6, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <18,25,0,7,14,u,u,u> -; AVX512F-NEXT: vpermi2d %zmm1, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm6 {%k2} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <18,25,0,7,14,u,u,u> +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm2, %zmm4, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm9, %zmm15 -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm6 {%k1} +; AVX512F-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 +; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <19,26,1,8,15,u,u,u> -; AVX512F-NEXT: vpermi2d %zmm1, %zmm3, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm2, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm10, %zmm15 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 ; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm2, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm10, %zmm15 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] -; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512F-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} -; AVX512F-NEXT: vpermi2d %zmm2, %zmm4, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm8, %zmm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [5,12,19,26] -; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm8 -; AVX512F-NEXT: vinserti32x4 $0, %xmm8, %zmm12, %zmm8 -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm8 {%k1} -; AVX512F-NEXT: vpermt2d %zmm2, %zmm7, %zmm4 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13,20,27] -; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm0 -; AVX512F-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm5, (%rsi) +; AVX512F-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 +; AVX512F-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} +; AVX512F-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm3, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm9, (%r8) ; AVX512F-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%r10) +; AVX512F-NEXT: vmovdqa64 %zmm6, (%r10) ; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2611,115 +2625,115 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,7,14,21,28,u,u,u> -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,21,28,u,u,u> +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 ; AVX512BW-NEXT: movb $-32, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm6, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <1,8,15,22,29,u,u,u> -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512BW-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm14 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <18,25,0,7,14,u,u,u> -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm6 {%k2} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <18,25,0,7,14,u,u,u> +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm9, %zmm15 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm6 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 +; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <19,26,1,8,15,u,u,u> -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm3, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm10, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm10, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [5,12,19,26] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm8 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm12, %zmm8 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm8 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm7, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13,20,27] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm0 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi) +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2744,503 +2758,491 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i32_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $1176, %rsp # imm = 0x498 -; SSE-NEXT: movdqa 80(%rdi), %xmm6 -; SSE-NEXT: movdqa (%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm10 -; SSE-NEXT: movdqa 640(%rdi), %xmm2 -; SSE-NEXT: movdqa 608(%rdi), %xmm3 -; SSE-NEXT: movdqa 560(%rdi), %xmm8 +; SSE-NEXT: subq $1160, %rsp # imm = 0x488 +; SSE-NEXT: movdqa 80(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm5 -; SSE-NEXT: movdqa 160(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm12 +; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 640(%rdi), %xmm3 +; SSE-NEXT: movdqa 608(%rdi), %xmm4 +; SSE-NEXT: movdqa 560(%rdi), %xmm10 +; SSE-NEXT: movdqa 576(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm7 +; SSE-NEXT: movdqa 160(%rdi), %xmm9 +; SSE-NEXT: movdqa 112(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm14 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 448(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 464(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 528(%rdi), %xmm2 -; SSE-NEXT: movdqa 496(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa 528(%rdi), %xmm9 +; SSE-NEXT: movdqa 496(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 336(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 416(%rdi), %xmm2 -; SSE-NEXT: movdqa 384(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa 416(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 384(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 784(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 784(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 800(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 864(%rdi), %xmm2 -; SSE-NEXT: movdqa 832(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa 864(%rdi), %xmm3 +; SSE-NEXT: movdqa 832(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: movdqa 272(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 672(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 688(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movdqa 752(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 720(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa 144(%rdi), %xmm14 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa 592(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa 480(%rdi), %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa 592(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa 368(%rdi), %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa 32(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa 816(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa 480(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa 256(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm8[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa 368(%rdi), %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm15[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa 816(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movdqa (%rsp), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa 256(%rdi), %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] ; SSE-NEXT: movdqa 704(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm8[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm8[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm8[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 400(%rdi), %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 400(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,1,1] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 624(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 512(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 512(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 848(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 848(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; SSE-NEXT: movdqa 736(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm12 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm12[2],xmm8[3],xmm12[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm1 +; SSE-NEXT: movdqa 96(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movdqa 208(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd $250, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 656(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 768(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 544(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 656(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 768(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 880(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -3249,54 +3251,59 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa %xmm13, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -3316,9 +3323,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm6 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] @@ -3326,22 +3332,20 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm5 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -3350,7 +3354,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -3413,7 +3418,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r8) @@ -3421,7 +3426,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%r9) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%r9) @@ -3436,9 +3441,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm12, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rax) +; SSE-NEXT: movaps %xmm1, 112(%rax) +; SSE-NEXT: movapd %xmm15, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3459,18 +3464,18 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm6, 48(%rax) ; SSE-NEXT: movapd %xmm7, 32(%rax) ; SSE-NEXT: movapd %xmm8, 16(%rax) -; SSE-NEXT: movapd %xmm13, (%rax) -; SSE-NEXT: addq $1176, %rsp # imm = 0x498 +; SSE-NEXT: movapd %xmm9, (%rax) +; SSE-NEXT: addq $1160, %rsp # imm = 0x488 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1496, %rsp # imm = 0x5D8 +; AVX1-ONLY-NEXT: subq $1464, %rsp # imm = 0x5B8 ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 @@ -3482,29 +3487,32 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 @@ -3513,8 +3521,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3524,27 +3533,24 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm14 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm11[1] +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3554,14 +3560,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 752(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3580,461 +3586,453 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm9[0,1] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,0],ymm1[3,3],ymm6[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm10[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm0[2,2],ymm7[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0],xmm8[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm13[2,3],ymm11[0,1] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0],ymm2[3,3],ymm13[4,4],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm14[0,1] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0],ymm2[3,3],ymm12[4,4],ymm2[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = zero,xmm2[1,2],mem[0] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm4[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm0[2,2],ymm4[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1],ymm0[2,2],ymm13[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0],xmm15[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0],xmm5[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[0,0],ymm5[3,3],ymm1[4,4],ymm5[7,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm0[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,0],ymm5[3,3],ymm2[4,4],ymm5[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm11[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1],ymm6[2,2],ymm4[5,5],ymm6[6,6] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm1[0],xmm10[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[2,3],ymm3[0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm8[2,2],ymm3[5,5],ymm8[6,6] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,0],ymm15[3,3],ymm5[4,4],ymm15[7,7] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm12 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm6[0],xmm2[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm15[2,3],ymm5[0,1] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,0],ymm13[3,3],ymm15[4,4],ymm13[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,xmm13[1,2],xmm4[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm13[3,1],mem[0,3],ymm13[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm11[2,1],ymm13[2,0],ymm11[6,5],ymm13[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm13[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1],ymm10[0,3],ymm13[7,5],ymm10[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[2,1],ymm13[2,0],ymm7[6,5],ymm13[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm13[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1],ymm8[0,3],ymm13[7,5],ymm8[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm9[2,1],ymm13[2,0],ymm9[6,5],ymm13[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm5[0],ymm15[2],ymm5[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm13[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm15[1,2],xmm2[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm15[3,1],mem[0,3],ymm15[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[2,1],ymm15[2,0],ymm5[6,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[3,1],ymm9[0,3],ymm15[7,5],ymm9[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[2,1],ymm15[2,0],ymm8[6,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[3,1],ymm6[0,3],ymm15[7,5],ymm6[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,1],ymm15[2,0],ymm4[6,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm12[0],ymm3[0],ymm12[2],ymm3[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm2[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm3[0,3],ymm14[7,5],ymm3[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm7[2,1],ymm14[2,0],ymm7[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,1],ymm14[0,3],ymm8[7,5],ymm14[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm4[2,1],ymm8[2,0],ymm4[6,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0],ymm1[0,0],ymm6[5,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,1],ymm6[0,2],ymm1[7,5],ymm6[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,1],ymm14[1,3],ymm0[4,5],ymm14[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,2],ymm10[2,0],ymm5[4,6],ymm10[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,0],ymm8[0,0],ymm9[5,4],ymm8[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[3,1],ymm6[0,2],ymm8[7,5],ymm6[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm0[1,3],ymm1[4,5],ymm0[5,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm13[0,2],ymm10[2,0],ymm13[4,6],ymm10[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,0],ymm7[0,0],ymm3[5,4],ymm7[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,1],ymm8[0,2],ymm7[7,5],ymm8[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm11[0,0],ymm0[5,4],ymm11[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,1],ymm0[0,2],ymm11[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm1[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,1],ymm4[1,3],ymm2[4,5],ymm4[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,2],ymm9[2,0],ymm11[4,6],ymm9[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[1,0],ymm4[0,0],ymm3[5,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1],ymm5[0,2],ymm4[7,5],ymm5[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,1],ymm6[1,3],ymm8[4,5],ymm6[5,7] -; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm9[0,2],ymm5[2,0],ymm9[4,6],ymm5[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm13[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,0],ymm4[0,0],ymm13[7,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[2,0],ymm4[2,0],ymm3[6,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = mem[0],xmm8[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,1],ymm9[1,3],ymm1[4,5],ymm9[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[0,2],ymm8[2,0],ymm3[4,6],ymm8[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,0],ymm7[0,0],ymm10[5,4],ymm7[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,1],ymm0[0,2],ymm7[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm11[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,0],ymm14[2,0],ymm8[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm8[1,3],ymm1[4,5],ymm8[5,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,2],ymm10[2,0],ymm12[4,6],ymm10[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,0],ymm4[0,0],ymm14[5,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,1],ymm0[0,2],ymm4[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm2[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,1],ymm13[1,3],ymm7[4,5],ymm13[5,7] +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm14 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm6[0,2],ymm10[2,0],ymm6[4,6],ymm10[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm2[0,0],ymm0[5,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,1],ymm0[0,2],ymm2[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[0,1],ymm7[1,3],ymm5[4,5],ymm7[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,2],ymm6[2,0],ymm15[4,6],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm0[0,0],ymm12[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,0],ymm8[2,0],ymm1[5,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm6[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = mem[0],xmm12[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,0],ymm9[2,0],ymm3[5,4],ymm9[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = mem[0],xmm12[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1,2],xmm12[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = mem[0],xmm13[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,0],ymm3[0,0],ymm11[7,4],ymm3[4,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[3,0],ymm10[0,0],ymm1[7,4],ymm10[4,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,0],ymm10[2,0],ymm11[5,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm14[1,0],ymm11[2,0],ymm14[5,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,0],ymm10[2,0],ymm2[6,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = mem[0],xmm12[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm2[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,0],ymm1[0,0],ymm9[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm15[1,0],ymm5[2,0],ymm15[5,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,0],ymm2[2,0],ymm1[6,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm15[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,0],ymm12[0,0],ymm15[7,4],ymm12[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm5[1,0],ymm7[2,0],ymm5[5,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[2,0],ymm12[2,0],ymm1[6,4],ymm12[6,4] ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm8[2,1],mem[3,3],ymm8[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm3[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,0],ymm2[2,0],ymm14[5,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = mem[0],xmm15[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm3[2,1],ymm9[3,3],ymm3[6,5],ymm9[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm1[0],mem[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm15[1,0],ymm12[2,0],ymm15[5,4],ymm12[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm3[0,0],mem[1,0],ymm3[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[0,1],xmm0[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm1[0,0],mem[1,0],ymm1[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[2,1],ymm8[3,3],ymm3[6,5],ymm8[7,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm10[0],xmm8[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,0],ymm0[2,0],ymm14[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0],xmm7[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,0],ymm0[2,0],ymm15[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,0],ymm15[1,0],ymm12[4,4],ymm15[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm14[0,1],xmm9[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,0],ymm12[1,0],ymm9[4,4],ymm12[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm15[0,1],xmm6[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm11[2,1],mem[3,3],ymm11[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, (%rsp), %xmm11, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm11[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,0],ymm0[2,0],ymm14[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[0,0],mem[1,0],ymm14[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[0,1],xmm13[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,1],ymm5[3,3],ymm6[6,5],ymm5[7,7] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,0],ymm0[2,0],ymm14[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm11[3,3],ymm0[6,5],ymm11[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0],xmm1[1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,0],ymm0[2,0],ymm15[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm1[0,0],mem[1,0],ymm1[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm5[2,1],mem[3,3],ymm5[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm14 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,0],ymm0[2,0],ymm2[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm5[0,0],ymm6[1,0],ymm5[4,4],ymm6[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm14[0,1],xmm1[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,0],ymm0[0,0],ymm4[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,0],ymm15[2,0],ymm12[5,4],ymm15[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm5[0,0],ymm13[1,0],ymm5[4,4],ymm13[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm0[0,0],ymm3[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,0],ymm12[2,0],ymm9[5,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm0[0,0],ymm14[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm13[2,0],ymm5[5,4],ymm13[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,0],ymm1[0,0],ymm9[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0],ymm6[2,0],ymm5[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[1,0],mem[2,0],ymm4[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[0,0],ymm4[7,4],ymm2[4,4] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm2[1,0],mem[2,0],ymm2[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: addq $1496, %rsp # imm = 0x5D8 +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: addq $1464, %rsp # imm = 0x5B8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -4057,7 +4055,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] ; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm12 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] @@ -4071,12 +4069,12 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -4093,7 +4091,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 304(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm13 ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm2 @@ -4108,14 +4106,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm11 ; AVX2-SLOW-NEXT: vpbroadcastq 752(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %xmm2 @@ -4135,13 +4133,13 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -4159,7 +4157,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] @@ -4172,20 +4170,21 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -4197,114 +4196,114 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-SLOW-NEXT: vpbroadcastd 652(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 752(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm12 = ymm9[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm3[3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 680(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm3[1],xmm15[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX2-SLOW-NEXT: vpbroadcastd 876(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-SLOW-NEXT: vpbroadcastd 652(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 752(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 680(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm4[1],xmm12[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] +; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd 876(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = ymm4[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm12 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] ; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,2],ymm15[1,3],ymm14[4,6],ymm15[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm8[0],mem[1],ymm8[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,2],ymm13[1,3],ymm14[4,6],ymm13[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm7[1,3],ymm8[4,6],ymm7[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm5[1,3],ymm3[4,6],ymm5[5,7] -; AVX2-SLOW-NEXT: vmovaps %ymm5, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm12 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2],ymm4[1,3],ymm13[4,6],ymm4[5,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm8 ; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -4317,237 +4316,238 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,2],ymm9[1,3],ymm11[4,6],ymm9[5,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [4,3,4,3] -; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vbroadcastss 548(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,7,0,7,0,7,0,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 660(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vmovaps %ymm14, %ymm7 -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 212(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastd 548(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] +; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm11, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 660(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vmovaps %ymm14, %ymm10 +; AVX2-SLOW-NEXT: vpermd %ymm14, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 212(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vbroadcastss 324(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm8[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 436(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vbroadcastss 772(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm5, %ymm12 -; AVX2-SLOW-NEXT: vbroadcastss 884(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0],ymm7[1],ymm13[2,3,4],ymm7[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 324(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-SLOW-NEXT: vpbroadcastd 436(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpbroadcastd 772(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermd %ymm15, %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 884(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-SLOW-NEXT: vpbroadcastd 664(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm8[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm4, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-SLOW-NEXT: vpbroadcastd 440(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vpbroadcastd 888(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpermd 640(%rdi), %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 528(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpermd 864(%rdi), %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 752(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermps 192(%rdi), %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpermps 416(%rdi), %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 304(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%r9) +; AVX2-SLOW-NEXT: vpermd 192(%rdi), %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 80(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpermd 416(%rdi), %ymm11, %ymm6 +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 304(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%r9) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rax) ; AVX2-SLOW-NEXT: addq $1224, %rsp # imm = 0x4C8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -4555,24 +4555,25 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-LABEL: load_i32_stride7_vf32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1192, %rsp # imm = 0x4A8 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm13 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm9[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] @@ -4585,15 +4586,13 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm7[6],ymm9[7] -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm7[6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 528(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -4605,13 +4604,11 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm11 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -4622,11 +4619,11 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm14 ; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4652,14 +4649,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm12 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] @@ -4667,20 +4664,19 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm15 ; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm15[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm10[2,3],ymm5[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm13[1],ymm8[2,3,4],ymm13[5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] @@ -4695,360 +4691,368 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3],ymm9[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm3[2,3],ymm13[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] -; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] +; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm0[1],xmm11[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] -; AVX2-FAST-NEXT: vpbroadcastd 652(%rdi), %ymm15 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm15[0],ymm12[2],ymm15[2] +; AVX2-FAST-NEXT: vpbroadcastd 652(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 752(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 680(%rdi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm10[1],xmm11[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm13[0],ymm6[0],ymm13[2],ymm6[2] +; AVX2-FAST-NEXT: vpbroadcastd 876(%rdi), %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 752(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm11 = ymm13[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm2[3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 680(%rdi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm2[1],xmm15[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm11 = ymm9[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm6[0],ymm14[2],ymm6[2] -; AVX2-FAST-NEXT: vpbroadcastd 876(%rdi), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm11 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm14 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm3[1],xmm14[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm2[0],ymm15[0],ymm2[2],ymm15[2] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm3[0],ymm8[0],ymm3[2],ymm8[2] ; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm8[0],mem[1],ymm8[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm5 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm14[0,2],ymm5[1,3],ymm14[4,6],ymm5[5,7] +; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm10[0,2],ymm5[1,3],ymm10[4,6],ymm5[5,7] -; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm9 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,2],ymm13[1,3],ymm12[4,6],ymm13[5,7] +; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm4[1],mem[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,2],ymm9[1,3],ymm12[4,6],ymm9[5,7] -; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm8[1,3],ymm3[4,6],ymm8[5,7] +; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm4[1],ymm7[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],mem[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm15[1,3],ymm2[4,6],ymm15[5,7] -; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm13[1,3],ymm14[4,6],ymm13[5,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm12[1,3],ymm10[4,6],ymm12[5,7] ; AVX2-FAST-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,3,4,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastd 548(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %xmm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 660(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm9 +; AVX2-FAST-NEXT: vpbroadcastd 660(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm4 ; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastd 324(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-NEXT: vpbroadcastd 436(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd 324(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 436(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastd 772(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm11, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd 884(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,0,3,3,1,0,7,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm10[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastd 772(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd 884(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm14[1],ymm4[2,3,4],ymm14[5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [1,0,3,3,1,0,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1,2],xmm15[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vpbroadcastd 664(%rdi), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm9[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vpbroadcastd 440(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %xmm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0,1,2],xmm8[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vpbroadcastd 664(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm8 +; AVX2-FAST-NEXT: vpbroadcastd 888(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 584(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpermd 640(%rdi), %ymm11, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 528(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 808(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd 864(%rdi), %ymm11, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 752(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3] +; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm9, %ymm12 -; AVX2-FAST-NEXT: vpbroadcastd 440(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 136(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm11, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1,2],xmm7[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpbroadcastd 888(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 584(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpermd 640(%rdi), %ymm11, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 528(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3] -; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 808(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpermd 864(%rdi), %ymm11, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 752(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 136(%rdi), %xmm12 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm11, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 360(%rdi), %xmm12 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-FAST-NEXT: vpermd 416(%rdi), %ymm11, %ymm11 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3] ; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 360(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] +; AVX2-FAST-NEXT: vpermd 416(%rdi), %ymm11, %ymm11 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] +; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm9, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm9, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm9, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm9, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm7, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm8, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm10, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 64(%rax) ; AVX2-FAST-NEXT: addq $1192, %rsp # imm = 0x4A8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -5072,7 +5076,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] @@ -5086,12 +5090,12 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -5108,7 +5112,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 304(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm2 @@ -5123,14 +5127,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 752(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %xmm2 @@ -5150,13 +5154,13 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -5174,7 +5178,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] @@ -5187,20 +5191,21 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -5212,114 +5217,114 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 652(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 752(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm12 = ymm9[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm3[3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 680(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm3[1],xmm15[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 876(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 652(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 752(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 680(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm4[1],xmm12[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 876(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm13 = ymm4[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm12 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,2],ymm15[1,3],ymm14[4,6],ymm15[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm8[0],mem[1],ymm8[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,2],ymm13[1,3],ymm14[4,6],ymm13[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm7[1,3],ymm8[4,6],ymm7[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm5[1,3],ymm3[4,6],ymm5[5,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2],ymm4[1,3],ymm13[4,6],ymm4[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -5332,237 +5337,238 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,2],ymm9[1,3],ymm11[4,6],ymm9[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [4,3,4,3] -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 548(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 660(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 212(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 548(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 660(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm14, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 212(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 324(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm4, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 436(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 772(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 884(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0],ymm7[1],ymm13[2,3,4],ymm7[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 324(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 436(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 772(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 884(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 664(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm4, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 440(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 888(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermd 640(%rdi), %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 528(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm4, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermd 864(%rdi), %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 752(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermps 192(%rdi), %ymm4, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps 416(%rdi), %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 304(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vpermd 192(%rdi), %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 80(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermd 416(%rdi), %ymm11, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 304(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: addq $1224, %rsp # imm = 0x4C8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -5972,48 +5978,48 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride7_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $2456, %rsp # imm = 0x998 -; SSE-NEXT: movdqa 1088(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1056(%rdi), %xmm3 +; SSE-NEXT: movdqa 1088(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1056(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1008(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1024(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 640(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 608(%rdi), %xmm5 +; SSE-NEXT: movdqa 1024(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 640(%rdi), %xmm13 +; SSE-NEXT: movdqa 608(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 560(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm7 +; SSE-NEXT: movdqa 576(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm15 ; SSE-NEXT: movdqa 112(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1456(%rdi), %xmm1 @@ -6030,635 +6036,632 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 80(%rdi), %xmm14 -; SSE-NEXT: movdqa 48(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 80(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 448(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 464(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 528(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 528(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 496(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 896(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 896(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 912(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 976(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 976(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 944(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1344(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1344(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1360(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 1424(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 1424(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1392(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm11 +; SSE-NEXT: movdqa 336(%rdi), %xmm12 ; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 416(%rdi), %xmm8 -; SSE-NEXT: movdqa 384(%rdi), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movdqa 416(%rdi), %xmm4 +; SSE-NEXT: movdqa 384(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 784(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 784(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 800(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 864(%rdi), %xmm13 -; SSE-NEXT: movdqa 832(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: movdqa 864(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1232(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 832(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1232(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1248(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 1312(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1280(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: movdqa 1312(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1680(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1280(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1680(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1696(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; SSE-NEXT: movdqa 1760(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1728(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm8 ; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm3 -; SSE-NEXT: movdqa 272(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: movdqa 272(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 672(%rdi), %xmm11 ; SSE-NEXT: movdqa 688(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movdqa 752(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 720(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1120(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: movdqa 752(%rdi), %xmm14 +; SSE-NEXT: movdqa 720(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1120(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1136(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movdqa 1200(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1168(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1568(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; SSE-NEXT: movdqa 1200(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1168(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1568(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1584(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movdqa 1648(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; SSE-NEXT: movdqa 1648(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1616(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1616(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] ; SSE-NEXT: movdqa 144(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm4[0],xmm15[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm7[0],xmm15[1] ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa 368(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm4[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa 256(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] -; SSE-NEXT: movdqa 592(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa 480(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] -; SSE-NEXT: movdqa 816(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa 704(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa 368(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm7[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa 256(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1040(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 592(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa 480(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa 816(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: movdqa 704(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: movdqa 928(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1264(%rdi), %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa 1152(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 1040(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1488(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 928(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1376(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1712(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 1264(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa 1600(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa 1152(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 400(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 1488(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 512(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 1376(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 1712(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 736(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 1600(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 848(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 176(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 960(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 288(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 1072(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 400(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 512(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 624(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 736(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 1184(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 848(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 1296(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 960(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 1072(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 1408(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 1184(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 1296(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 1408(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; SSE-NEXT: movdqa 1520(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; SSE-NEXT: movdqa 1632(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; SSE-NEXT: movdqa 1744(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm0 +; SSE-NEXT: movdqa 544(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm0 +; SSE-NEXT: movdqa 656(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $250, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 656(%rdi), %xmm0 +; SSE-NEXT: movdqa 768(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 768(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 880(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: pshufd $250, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 880(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 992(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] @@ -6667,46 +6670,42 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1104(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1216(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1328(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1440(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] @@ -6717,7 +6716,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 1552(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] @@ -6732,140 +6732,159 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1776(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa (%rsp), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa (%rsp), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6874,22 +6893,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] @@ -6898,8 +6906,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -6908,60 +6915,48 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6970,8 +6965,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6980,25 +6975,26 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -7007,8 +7003,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -7017,8 +7013,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -7027,25 +7022,34 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7053,24 +7057,27 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] @@ -7080,7 +7087,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] @@ -7088,22 +7096,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm14 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] @@ -7113,36 +7119,38 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[0,0,1,1] @@ -7175,17 +7183,18 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] @@ -7201,12 +7210,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -7428,12 +7437,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i32_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3144, %rsp # imm = 0xC48 +; AVX1-ONLY-NEXT: subq $3176, %rsp # imm = 0xC68 ; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 @@ -7442,12 +7451,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm13 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm4 @@ -7463,21 +7471,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 752(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] @@ -7490,14 +7497,15 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7521,7 +7529,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] @@ -7537,21 +7545,21 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7574,47 +7582,47 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm9[1] -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm10[1] +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7628,75 +7636,76 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm15[1] -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1],ymm0[2,2],ymm13[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0],xmm12[1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,0],ymm1[3,3],ymm8[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[2] +; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1],ymm0[2,2],ymm11[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0],ymm1[3,3],ymm3[4,4],ymm1[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm11[2] +; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[1,1],ymm0[2,2],ymm9[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm10[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0],xmm9[1],xmm11[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 @@ -7713,13 +7722,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[2,2],ymm1[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm1 @@ -7736,8 +7745,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[2,2],ymm1[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm0[2,2],ymm15[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7759,12 +7767,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm0[2,2],ymm3[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0],xmm15[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 @@ -7774,406 +7783,405 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm0[0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = zero,xmm3[1,2],mem[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1],ymm0[2,2],ymm4[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm6[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm12[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm0[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm9[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm0[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm4[3,3],ymm1[4,4],ymm4[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm4[1,2],xmm10[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,1],ymm0[2,2],ymm7[5,5],ymm0[6,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[1,1],ymm0[2,2],ymm8[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0],xmm9[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0],xmm3[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3],ymm3[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,0],ymm14[3,3],ymm1[4,4],ymm14[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm15[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm1[2,3],ymm0[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0],ymm10[3,3],ymm1[4,4],ymm10[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm10[1,2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm12[0,3],ymm14[7,5],ymm12[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1,2],xmm8[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1,2],xmm11[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm5[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm5[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm2[2,1],ymm14[2,0],ymm2[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,1],ymm12[2,0],ymm2[6,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,1],ymm12[2,0],ymm2[6,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm2[2,1],ymm14[2,0],ymm2[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm6[2,1],ymm12[2,0],ymm6[6,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm5[0,3],ymm14[7,5],ymm5[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,1],ymm12[2,0],ymm2[6,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm2[2,1],ymm14[2,0],ymm2[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,0],ymm14[0,0],ymm12[5,4],ymm14[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,1],ymm7[0,2],ymm14[7,5],ymm7[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm6[0,3],ymm12[7,5],ymm6[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,1],ymm12[2,0],ymm2[6,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,0],ymm12[0,0],ymm10[5,4],ymm12[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[3,1],ymm10[0,2],ymm12[7,5],ymm10[4,6] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm14[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm12[0,1],mem[1,3],ymm12[4,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[0,1],ymm8[1,3],ymm12[4,5],ymm8[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[0,2],ymm15[2,0],ymm8[4,6],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2],ymm14[2,0],ymm12[4,6],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0],ymm14[0,0],ymm7[5,4],ymm14[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,1],ymm7[0,2],ymm14[7,5],ymm7[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm8[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm8[0,1],ymm4[1,3],ymm8[4,5],ymm4[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,2],ymm14[2,0],ymm0[4,6],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0],ymm4[0,0],ymm7[5,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[3,1],ymm7[0,2],ymm4[7,5],ymm7[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[0,1],ymm1[1,3],ymm4[4,5],ymm1[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[0,2],ymm14[2,0],ymm15[4,6],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm8[1,0],ymm12[0,0],ymm8[5,4],ymm12[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[3,1],ymm10[0,2],ymm12[7,5],ymm10[4,6] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm5[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,1],ymm1[1,3],ymm5[4,5],ymm1[5,7] ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,0],ymm1[0,0],ymm4[5,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,1],ymm7[0,2],ymm1[7,5],ymm7[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm13[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,2],ymm15[2,0],ymm1[4,6],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm1[0,0],ymm5[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[3,1],ymm10[0,2],ymm1[7,5],ymm10[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,1],ymm10[1,3],ymm1[4,5],ymm10[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm11[0,2],ymm14[2,0],ymm11[4,6],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,1],ymm0[1,3],ymm1[4,5],ymm0[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,2],ymm15[2,0],ymm13[4,6],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[3,1],ymm10[0,2],ymm0[7,5],ymm10[4,6] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm9[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm0[0,1],mem[1,3],ymm0[4,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[0,2],ymm15[2,0],ymm11[4,6],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm15[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm2[0,0],ymm5[5,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[0,0],ymm6[5,4],ymm2[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm9[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm4[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,1],ymm3[1,3],ymm0[4,5],ymm3[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[2,0],ymm6[4,6],ymm5[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[0,1],ymm3[1,3],ymm0[4,5],ymm3[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm7[0,2],ymm9[2,0],ymm7[4,6],ymm9[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm10[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm15[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm9[0,1],ymm8[1,3],ymm9[4,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm5[2,0],ymm15[4,6],ymm5[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,1],ymm8[1,3],ymm12[4,5],ymm8[5,7] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm11[0,2],ymm7[2,0],ymm11[4,6],ymm7[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm7[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm14[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm10[1,3],ymm4[4,5],ymm10[5,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,1],ymm3[1,3],ymm0[4,5],ymm3[5,7] -; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,2],ymm5[2,0],ymm14[4,6],ymm5[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[0,2],ymm7[2,0],ymm3[4,6],ymm7[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm7[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm4[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm5[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm13 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[0,1],ymm1[1,3],ymm11[4,5],ymm1[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm13[0,1],ymm1[1,3],ymm13[4,5],ymm1[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,2],ymm6[2,0],ymm2[4,6],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,2],ymm9[2,0],ymm2[4,6],ymm9[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm5[0,0],ymm2[7,4],ymm5[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm11[1,0],ymm1[2,0],ymm11[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,0],ymm0[6,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm7[0,0],ymm2[7,4],ymm7[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,0],ymm1[2,0],ymm13[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[2,0],ymm0[6,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm0[0,0],ymm12[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,0],ymm0[6,4],ymm5[6,4] ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm0[0,0],ymm14[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm3[2,0],ymm13[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm0[0,0],ymm3[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm10[2,0],ymm4[5,4],ymm10[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[3,0],ymm0[0,0],ymm15[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,0],ymm8[2,0],ymm9[5,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,0],ymm0[0,0],ymm11[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm8[2,0],ymm12[5,4],ymm8[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[1,0],mem[2,0],ymm4[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8181,112 +8189,91 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = mem[0],xmm10[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm1[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = mem[0],xmm3[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm1[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,0],ymm8[2,0],ymm2[5,4],ymm8[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm2[1,0],mem[2,0],ymm2[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = mem[0],xmm12[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm11[2,1],mem[3,3],ymm11[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = mem[0],xmm11[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm13[2,1],mem[3,3],ymm13[6,5],mem[7,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0],xmm10[1],xmm12[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,0],ymm4[2,0],ymm11[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm9[0,0],ymm15[1,0],ymm9[4,4],ymm15[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm14[0,1],xmm11[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm7[2,1],mem[3,3],ymm7[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0],xmm9[1],xmm12[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm10[1,0],ymm3[2,0],ymm10[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0],xmm9[1],xmm14[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,0],ymm4[2,0],ymm11[5,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm13[0,0],ymm14[1,0],ymm13[4,4],ymm14[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[0,1],xmm7[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm6[2,1],mem[3,3],ymm6[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0],xmm15[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,0],ymm3[2,0],ymm7[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[0,0],mem[1,0],ymm10[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[0,1],xmm7[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm10[2,1],mem[3,3],ymm10[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0],xmm7[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm11[0,0],mem[1,0],ymm11[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm11[0,1],xmm7[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0],xmm11[1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,0],ymm3[2,0],ymm7[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,0],mem[1,0],ymm7[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[0,1],xmm6[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[2,1],mem[3,3],ymm3[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm6[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,0],ymm3[2,0],ymm6[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,0],ymm4[2,0],ymm7[5,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2],xmm6[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -8295,33 +8282,51 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[0,1],xmm6[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[2,1],mem[3,3],ymm3[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm6[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm5[2,1],mem[3,3],ymm5[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm7[1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,0],ymm3[2,0],ymm6[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0],ymm7[1,0],ymm6[4,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm5[1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[1,0],ymm3[2,0],ymm5[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,0],ymm4[2,0],ymm5[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,1],xmm3[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm8[2,1],mem[3,3],ymm8[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm6[1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2],xmm4[3] @@ -8333,12 +8338,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm5[2,1],mem[3,3],ymm5[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm8[2,1],mem[3,3],ymm8[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm5[0],mem[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -8351,10 +8356,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,1],xmm1[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,1],ymm8[3,3],ymm2[6,5],ymm8[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm2[2,1],mem[3,3],ymm2[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm3[1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm1[2,0],ymm3[5,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -8367,9 +8373,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[0,1],xmm0[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,0],ymm0[0,0],ymm11[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm0[0,0],ymm13[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] @@ -8377,34 +8383,34 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,0],ymm14[2,0],ymm13[5,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0],ymm15[2,0],ymm2[5,4],ymm15[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm14[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[1,0],mem[2,0],ymm3[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm0[1,0],mem[2,0],ymm0[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,0],ymm1[0,0],ymm10[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 @@ -8413,6 +8419,25 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm0[1,0],mem[2,0],ymm0[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,0],ymm1[0,0],ymm3[7,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm4[1,0],mem[2,0],ymm4[5,4],mem[6,4] @@ -8433,91 +8458,70 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,0],ymm3[0,0],ymm6[7,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,0],ymm6[4,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,0],ymm7[2,0],ymm8[5,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[3,0],ymm3[0,0],ymm7[7,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,0],ymm7[4,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[3,0],ymm3[0,0],ymm5[7,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,0],ymm8[4,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm9[1,0],mem[2,0],ymm9[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[3,0],ymm8[0,0],ymm9[7,4],ymm8[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[3,0],ymm4[0,0],ymm7[7,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,0],ymm9[4,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,0],ymm9[4,5],ymm4[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm8[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,0],ymm4[0,0],ymm8[7,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm5[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1],ymm4[2,0],ymm10[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[3,0],ymm10[0,0],ymm5[7,4],ymm10[4,4] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm11[0],mem[1],xmm11[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm11[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -8526,14 +8530,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm9, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm11, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm11, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm11, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -8616,25 +8620,25 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $3144, %rsp # imm = 0xC48 +; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rax) +; AVX1-ONLY-NEXT: addq $3176, %rsp # imm = 0xC68 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $2664, %rsp # imm = 0xA68 +; AVX2-SLOW-NEXT: subq $2680, %rsp # imm = 0xA78 ; AVX2-SLOW-NEXT: vmovdqa 1216(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1152(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 1120(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm6 @@ -8681,12 +8685,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 1200(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm7 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 1248(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 1280(%rdi), %xmm3 @@ -8698,10 +8703,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1600(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1568(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8738,14 +8744,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm2[6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm2[6],ymm13[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm12 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] @@ -8758,19 +8764,17 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 896(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm15 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 992(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq 976(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 1056(%rdi), %xmm3 @@ -8787,10 +8791,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 1440(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 1440(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vpbroadcastq 1424(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 1472(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 1504(%rdi), %xmm2 @@ -8832,10 +8836,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] @@ -8847,20 +8851,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 1280(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 1280(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1248(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 1184(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8876,15 +8880,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 1632(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa 1632(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8902,11 +8905,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm5[1],ymm15[2,3,4],ymm5[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8924,10 +8927,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 960(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm15[1],mem[2,3,4],ymm15[5],mem[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8945,8 +8950,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -8963,17 +8967,16 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm13 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm12 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -8994,48 +8997,46 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 680(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm11[0],mem[0],ymm11[2],mem[2] ; AVX2-SLOW-NEXT: vpbroadcastd 876(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1200(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 1128(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] +; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] ; AVX2-SLOW-NEXT: vpbroadcastd 1324(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1648(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 1576(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm2 @@ -9044,32 +9045,31 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-SLOW-NEXT: vpbroadcastd 1772(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpbroadcastd 1772(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] -; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] +; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm14[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -9080,75 +9080,77 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 976(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 904(%rdi), %xmm15 ; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] ; AVX2-SLOW-NEXT: vpbroadcastd 1100(%rdi), %ymm14 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1424(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm14 = ymm13[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm14 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 1352(%rdi), %xmm15 ; AVX2-SLOW-NEXT: vmovdqa 1376(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] ; AVX2-SLOW-NEXT: vpbroadcastd 1548(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = ymm6[0],mem[1],ymm6[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2],ymm5[1,3],ymm9[4,6],ymm5[5,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm4[0,2],ymm15[1,3],ymm4[4,6],ymm15[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2],ymm3[1,3],ymm6[4,6],ymm3[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm11[0,2],mem[1,3],ymm11[4,6],mem[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm8[1,3],ymm15[4,6],ymm8[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 1328(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,2],ymm14[1,3],ymm13[4,6],ymm14[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 1328(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -9157,20 +9159,19 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm14[0,2],mem[1,3],ymm14[4,6],mem[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 1776(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 1776(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $253, (%rsp), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm7[1],mem[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,2],ymm1[1,3],ymm4[4,6],ymm1[5,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 1552(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] @@ -9182,13 +9183,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,2],ymm10[1,3],ymm11[4,6],ymm10[5,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm7[1,3],ymm8[4,6],ymm7[5,7] +; AVX2-SLOW-NEXT: vmovaps %ymm7, %ymm9 +; AVX2-SLOW-NEXT: vmovaps %ymm8, %ymm11 ; AVX2-SLOW-NEXT: vbroadcastss 1104(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -9197,8 +9199,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm7[1,3],ymm6[4,6],ymm7[5,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -9212,502 +9214,498 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm5[1,3],ymm3[4,6],ymm5[5,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm0 = [4,3,4,3] -; AVX2-SLOW-NEXT: # xmm0 = mem[0,0] +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm7 = [4,3,4,3] +; AVX2-SLOW-NEXT: # xmm7 = mem[0,0] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm9 = [0,7,0,7,0,7,0,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm9, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 212(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vbroadcastss 324(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 436(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 212(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vbroadcastss 548(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 660(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm7, %ymm3 +; AVX2-SLOW-NEXT: vbroadcastss 324(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 436(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vbroadcastss 548(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 660(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm7, %ymm5 ; AVX2-SLOW-NEXT: vbroadcastss 772(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm9, %ymm6 -; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 884(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vbroadcastss 996(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm9, %ymm6 +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1108(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vbroadcastss 1220(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 1184(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm9, %ymm6 -; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1332(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vbroadcastss 884(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vbroadcastss 1444(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 1408(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1556(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vbroadcastss 1668(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 1632(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm9, %ymm6 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1780(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vbroadcastss 996(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1108(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm7, %ymm8 +; AVX2-SLOW-NEXT: vbroadcastss 1220(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vmovaps 1184(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0,1,2],xmm9[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1332(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm7, %ymm8 +; AVX2-SLOW-NEXT: vbroadcastss 1444(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vmovaps 1408(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1556(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vbroadcastss 1668(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovaps 1632(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1780(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm11[1],mem[2,3,4],ymm11[5],mem[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 992(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 992(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 1112(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 1336(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm13[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 1560(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1664(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm6[0,1,2],xmm14[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 1784(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpermps 192(%rdi), %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpermps 416(%rdi), %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 304(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1032(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpermps 1088(%rdi), %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 976(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1256(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpermps 1312(%rdi), %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1200(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1480(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpermps 1536(%rdi), %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1424(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1704(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-SLOW-NEXT: vpermps 1760(%rdi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1648(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r9) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1112(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm13[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1336(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm15[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1560(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1664(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1784(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermps 192(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermps 416(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 304(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1032(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpermps 1088(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 976(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1256(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpermps 1312(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1200(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1480(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermps 1536(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1424(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1704(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps 1760(%rdi), %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1648(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm13, 224(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm8, 160(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 128(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm10, 128(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 160(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 160(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 128(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm14, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rax) -; AVX2-SLOW-NEXT: addq $2664, %rsp # imm = 0xA68 +; AVX2-SLOW-NEXT: vmovaps %ymm12, (%rax) +; AVX2-SLOW-NEXT: addq $2680, %rsp # imm = 0xA78 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride7_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $2648, %rsp # imm = 0xA58 -; AVX2-FAST-NEXT: vmovdqa 1216(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 1152(%rdi), %ymm11 +; AVX2-FAST-NEXT: subq $2680, %rsp # imm = 0xA78 +; AVX2-FAST-NEXT: vmovdqa 1216(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 1152(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 1120(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm12 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm2[6],ymm10[7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm11 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -9718,12 +9716,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm15[6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 752(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %xmm3 @@ -9734,13 +9733,15 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm11[6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 1200(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 1248(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 1280(%rdi), %xmm3 @@ -9751,14 +9752,16 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1600(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 1568(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1600(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 1568(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqa 1664(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq 1648(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] @@ -9772,9 +9775,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm14 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 @@ -9791,11 +9794,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9860,201 +9863,205 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm14[2,3],ymm6[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,6,5,6,5,6,5,6] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,0,7,7,5,4,7,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm12[2,3],ymm7[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm3[2,3],ymm11[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 1280(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 1280(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1248(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1248(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 1184(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm0[2,3],ymm8[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 1184(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 1728(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1728(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 1696(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 1632(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm0[2,3],ymm8[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa 1632(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm15[2,3],ymm6[4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm13[1],ymm5[2,3,4],ymm13[5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm15[2,3],ymm10[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm14 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm13 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 1056(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1056(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 1024(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 960(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 960(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 1504(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1504(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 1408(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 1408(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm4[0],ymm12[2],ymm4[2] ; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 680(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] ; AVX2-FAST-NEXT: vpbroadcastd 876(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 1200(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 1128(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 1152(%rdi), %xmm2 @@ -10069,8 +10076,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 1648(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 1576(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 1600(%rdi), %xmm2 @@ -10079,34 +10085,34 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-NEXT: vpbroadcastd 1772(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpbroadcastd 1772(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] ; AVX2-FAST-NEXT: vpbroadcastd 652(%rdi), %ymm15 @@ -10115,79 +10121,79 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 976(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 904(%rdi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 928(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] +; AVX2-FAST-NEXT: vmovdqa 928(%rdi), %xmm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm11[1],xmm15[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] -; AVX2-FAST-NEXT: vpbroadcastd 1100(%rdi), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX2-FAST-NEXT: vpbroadcastd 1100(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 1424(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm14 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm13 = ymm5[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 1352(%rdi), %xmm15 ; AVX2-FAST-NEXT: vmovdqa 1376(%rdi), %xmm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX2-FAST-NEXT: vpbroadcastd 1548(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[0],ymm11[1],mem[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,2],ymm3[1,3],ymm15[4,6],ymm3[5,7] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] +; AVX2-FAST-NEXT: vpbroadcastd 1548(%rdi), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2],ymm4[1,3],ymm12[4,6],ymm4[5,7] ; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm12[0,2],ymm14[1,3],ymm12[4,6],ymm14[5,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm3[1,3],ymm15[4,6],ymm3[5,7] +; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm13 ; AVX2-FAST-NEXT: vbroadcastss 880(%rdi), %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,2],ymm13[1,3],ymm11[4,6],ymm13[5,7] -; AVX2-FAST-NEXT: vbroadcastss 1328(%rdi), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm12[1,3],ymm14[4,6],ymm12[5,7] +; AVX2-FAST-NEXT: vbroadcastss 1328(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] @@ -10197,8 +10203,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] -; AVX2-FAST-NEXT: vbroadcastss 1776(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vbroadcastss 1776(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3,4,5,6,7] @@ -10206,26 +10212,25 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,2],ymm1[1,3],ymm4[4,6],ymm1[5,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7] ; AVX2-FAST-NEXT: vbroadcastss 1552(%rdi), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm8 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm10[1,3],ymm7[4,6],ymm10[5,7] -; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm9 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm7[1,3],ymm10[4,6],ymm7[5,7] +; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm10 ; AVX2-FAST-NEXT: vbroadcastss 1104(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] @@ -10234,7 +10239,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm7[1,3],ymm6[4,6],ymm7[5,7] ; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm2 @@ -10250,25 +10255,25 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm5[1,3],ymm3[4,6],ymm5[5,7] ; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [4,3,4,3] -; AVX2-FAST-NEXT: # xmm5 = mem[0,0] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm4 = [4,3,4,3] +; AVX2-FAST-NEXT: # xmm4 = mem[0,0] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vbroadcastss 212(%rdi), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -10276,341 +10281,342 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vbroadcastss 324(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vbroadcastss 436(%rdi), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vbroadcastss 548(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-NEXT: vbroadcastss 436(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vbroadcastss 548(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovaps 512(%rdi), %xmm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vbroadcastss 660(%rdi), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vbroadcastss 772(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovaps 736(%rdi), %xmm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vbroadcastss 884(%rdi), %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vbroadcastss 996(%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vbroadcastss 1108(%rdi), %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vbroadcastss 1220(%rdi), %xmm8 ; AVX2-FAST-NEXT: vmovaps 1184(%rdi), %xmm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1332(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm6 -; AVX2-FAST-NEXT: vbroadcastss 1444(%rdi), %xmm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm12, %ymm15 +; AVX2-FAST-NEXT: vbroadcastss 1332(%rdi), %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vbroadcastss 1444(%rdi), %xmm10 ; AVX2-FAST-NEXT: vmovaps 1408(%rdi), %xmm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1556(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vbroadcastss 1668(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovaps 1632(%rdi), %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm10 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1780(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = [1,0,3,3,1,0,7,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vbroadcastss 216(%rdi), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vbroadcastss 440(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm2[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vbroadcastss 664(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1556(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vbroadcastss 1668(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovaps 1632(%rdi), %xmm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1780(%rdi), %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,3,3,1,0,7,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vbroadcastss 216(%rdi), %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vbroadcastss 888(%rdi), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 992(%rdi), %xmm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vbroadcastss 440(%rdi), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 544(%rdi), %xmm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps $34, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm14, %ymm4 -; AVX2-FAST-NEXT: vbroadcastss 1112(%rdi), %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %xmm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vbroadcastss 664(%rdi), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vbroadcastss 888(%rdi), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 992(%rdi), %xmm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm14, %ymm12 +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vbroadcastss 1112(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %xmm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm7[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm1, %ymm7 ; AVX2-FAST-NEXT: vbroadcastss 1336(%rdi), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm15[0,1,2],xmm8[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %xmm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vbroadcastss 1560(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1664(%rdi), %xmm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm15[0,1,2],xmm12[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[0],ymm5[1],mem[2,3,4],ymm5[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm14, %ymm12 -; AVX2-FAST-NEXT: vbroadcastss 1560(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1664(%rdi), %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm11[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vbroadcastss 1784(%rdi), %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 136(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermps 192(%rdi), %ymm0, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 80(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0],ymm13[1],mem[2,3,4],ymm13[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm14, %ymm11 -; AVX2-FAST-NEXT: vbroadcastss 1784(%rdi), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 136(%rdi), %xmm5 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpermps 192(%rdi), %ymm10, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vbroadcastss 80(%rdi), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 360(%rdi), %xmm5 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpermps 416(%rdi), %ymm10, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vbroadcastss 304(%rdi), %ymm14 -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm13 = xmm14[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 584(%rdi), %xmm5 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpermps 640(%rdi), %ymm10, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vbroadcastss 528(%rdi), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 360(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermps 416(%rdi), %ymm0, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 304(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm11 = xmm11[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 584(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermps 640(%rdi), %ymm0, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 528(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 808(%rdi), %xmm2 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpermps 864(%rdi), %ymm0, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 752(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 1032(%rdi), %xmm2 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpermps 1088(%rdi), %ymm0, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 976(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 808(%rdi), %xmm0 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps 864(%rdi), %ymm10, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vbroadcastss 752(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 1032(%rdi), %xmm1 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermps 1088(%rdi), %ymm10, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vbroadcastss 976(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm11[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 1256(%rdi), %xmm2 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermps 1312(%rdi), %ymm10, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1200(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpermps 1312(%rdi), %ymm0, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1200(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 1480(%rdi), %xmm3 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpermps 1536(%rdi), %ymm10, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1424(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vpermps 1536(%rdi), %ymm0, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1424(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 1704(%rdi), %xmm5 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-NEXT: vpermps 1760(%rdi), %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpermps 1760(%rdi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1648(%rdi), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1648(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -10687,42 +10693,43 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%r9) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm11, 224(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm12, 224(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm8, 192(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm7, 160(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rax) +; AVX2-FAST-NEXT: vmovaps %ymm6, 128(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm10, 224(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm3, 192(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm2, 160(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm14, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm14, 128(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm13, 32(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm9, (%rax) -; AVX2-FAST-NEXT: addq $2648, %rsp # imm = 0xA58 +; AVX2-FAST-NEXT: vmovaps %ymm10, (%rax) +; AVX2-FAST-NEXT: addq $2680, %rsp # imm = 0xA78 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $2664, %rsp # imm = 0xA68 +; AVX2-FAST-PERLANE-NEXT: subq $2680, %rsp # imm = 0xA78 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1216(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1152(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1120(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm6 @@ -10769,12 +10776,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1200(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1248(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1280(%rdi), %xmm3 @@ -10786,10 +10794,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1600(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1568(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1664(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10826,14 +10835,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm2[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm2[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] @@ -10846,19 +10855,17 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 896(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 992(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 976(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1024(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1056(%rdi), %xmm3 @@ -10875,10 +10882,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1440(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1440(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1424(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1472(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1504(%rdi), %xmm2 @@ -10920,10 +10927,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] @@ -10935,20 +10942,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1280(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1280(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1248(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1184(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10964,15 +10971,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1632(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1632(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10990,11 +10996,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm5[1],ymm15[2,3,4],ymm5[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -11012,10 +11018,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 960(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm15[1],mem[2,3,4],ymm15[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -11033,8 +11041,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1408(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -11051,17 +11058,16 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -11082,48 +11088,46 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 680(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm11[0],mem[0],ymm11[2],mem[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 876(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1200(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1128(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1152(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1324(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1648(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1576(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1600(%rdi), %xmm2 @@ -11132,32 +11136,31 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1772(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1772(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm14[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -11168,75 +11171,77 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 976(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 904(%rdi), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1100(%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1424(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm14 = ymm13[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm14 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1352(%rdi), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1376(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1548(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm6[0],mem[1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2],ymm5[1,3],ymm9[4,6],ymm5[5,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm4[0,2],ymm15[1,3],ymm4[4,6],ymm15[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2],ymm3[1,3],ymm6[4,6],ymm3[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm11[0,2],mem[1,3],ymm11[4,6],mem[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm8[1,3],ymm15[4,6],ymm8[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1328(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,2],ymm14[1,3],ymm13[4,6],ymm14[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1328(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -11245,20 +11250,19 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm14[0,2],mem[1,3],ymm14[4,6],mem[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1776(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1776(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $253, (%rsp), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm7[1],mem[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,2],ymm1[1,3],ymm4[4,6],ymm1[5,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1552(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] @@ -11270,13 +11274,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,2],ymm10[1,3],ymm11[4,6],ymm10[5,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm7[1,3],ymm8[4,6],ymm7[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1104(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -11285,8 +11290,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm7[1,3],ymm6[4,6],ymm7[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -11300,212 +11305,224 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm5[1,3],ymm3[4,6],ymm5[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm0 = [4,3,4,3] -; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm7 = [4,3,4,3] +; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm9 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 212(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 324(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 436(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 212(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 548(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 660(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm7, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 324(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 436(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 548(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 660(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm7, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 772(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 884(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 996(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1108(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1220(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1184(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1332(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1444(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1408(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1556(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 884(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1668(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1632(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1780(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 996(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1108(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm7, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1220(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1184(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0,1,2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1332(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm7, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1444(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1408(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1556(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1668(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1632(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1780(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm11[1],mem[2,3,4],ymm11[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 992(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 992(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1112(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1112(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1336(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] @@ -11516,590 +11533,571 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1336(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1560(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1664(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1784(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermps 192(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermps 416(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 304(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1560(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1664(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm6[0,1,2],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1784(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermps 192(%rdi), %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1032(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermps 416(%rdi), %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 304(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps 1088(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 976(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1256(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermps 1312(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1200(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1480(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1032(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermps 1536(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1424(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1704(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps 1760(%rdi), %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1648(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermps 1088(%rdi), %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 976(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1256(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermps 1312(%rdi), %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1200(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%r9) +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1480(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermps 1536(%rdi), %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1424(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1704(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps 1760(%rdi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1648(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $2664, %rsp # imm = 0xA68 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $2680, %rsp # imm = 0xA78 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: load_i32_stride7_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm14 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm14, %zmm4, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm14, %zmm4, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm14, %zmm4, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm4, %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm4, %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm16 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm22, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm19 +; AVX512F-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512F-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm22, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm16 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm22, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm22, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm22, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm14, %zmm4, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm17, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm6, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm6, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm24, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm27, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] +; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm28, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm29, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm19 +; AVX512F-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 ; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm24, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm27, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm28, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm29, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm24, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm9, %zmm21, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm27, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm21, %zmm9, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm28, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm21, %zmm9, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm29, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm21, %zmm9, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm0, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,21,28,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm20 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <1,8,15,22,29,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm21 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <18,25,0,7,14,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm5, %zmm22 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <19,26,1,8,15,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm8, %zmm23 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <0,7,14,21,28,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <1,8,15,22,29,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <18,25,0,7,14,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <19,26,1,8,15,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm5, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 ; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm2 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm10 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm5, %zmm12 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm8, %zmm13 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm18, %zmm25 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm19, %zmm26 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm19 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm31 = [6,13,20,27] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13,20,27] -; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 ; AVX512F-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512F-NEXT: movw $480, %ax # imm = 0x1E0 ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12109,60 +12107,60 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm25, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 +; AVX512F-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} ; AVX512F-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} @@ -12178,44 +12176,43 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} -; AVX512F-NEXT: vinserti32x4 $0, %xmm19, %zmm29, %zmm19 +; AVX512F-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 +; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} +; AVX512F-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm11, %zmm28, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 +; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm28, %zmm2 +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm7, %zmm28, %zmm7 +; AVX512F-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm15, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rdx) +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm13, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm8, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm13, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm15, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm24, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm16, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm18, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm20, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 %zmm19, 192(%rax) @@ -12223,10 +12220,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm26, 64(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512F-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -12234,329 +12231,325 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i32_stride7_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm14 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm16 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm19 +; AVX512BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm22, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm22, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm22, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm4, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm24, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm27, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] +; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm19 +; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 ; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm24, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm27, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm28, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm24, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm21, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm27, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm9, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm28, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm9, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm9, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,21,28,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm20 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <1,8,15,22,29,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm21 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <18,25,0,7,14,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm5, %zmm22 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <19,26,1,8,15,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm23 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,7,14,21,28,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <1,8,15,22,29,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <18,25,0,7,14,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <19,26,1,8,15,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm5, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm2 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm12 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm13 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm25 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm19, %zmm26 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm31 = [6,13,20,27] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13,20,27] -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 ; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512BW-NEXT: movw $480, %ax # imm = 0x1E0 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12566,60 +12559,60 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} ; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} @@ -12635,44 +12628,43 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} -; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm29, %zmm19 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm28, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm28, %zmm2 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm7, %zmm28, %zmm7 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%rax) @@ -12680,10 +12672,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512BW-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index 7e7398050087c..25db5a66461e8 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -487,100 +487,99 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i32_stride8_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps 112(%rdi), %xmm2 -; SSE-NEXT: movaps 176(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm15 +; SSE-NEXT: movaps 176(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdi), %xmm9 -; SSE-NEXT: movaps (%rdi), %xmm11 -; SSE-NEXT: movaps 32(%rdi), %xmm0 +; SSE-NEXT: movaps (%rdi), %xmm10 +; SSE-NEXT: movaps 32(%rdi), %xmm1 ; SSE-NEXT: movaps 96(%rdi), %xmm13 -; SSE-NEXT: movaps 64(%rdi), %xmm12 -; SSE-NEXT: movaps 160(%rdi), %xmm1 -; SSE-NEXT: movaps 128(%rdi), %xmm5 -; SSE-NEXT: movaps 224(%rdi), %xmm10 -; SSE-NEXT: movaps 192(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: movaps 64(%rdi), %xmm11 +; SSE-NEXT: movaps 160(%rdi), %xmm2 +; SSE-NEXT: movaps 128(%rdi), %xmm6 +; SSE-NEXT: movaps 224(%rdi), %xmm12 +; SSE-NEXT: movaps 192(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm14 ; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; SSE-NEXT: movaps %xmm11, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: movaps %xmm7, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm14[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm14[0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] -; SSE-NEXT: movaps 240(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm10[2],xmm15[3],xmm10[3] -; SSE-NEXT: movaps 208(%rdi), %xmm10 -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm15[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; SSE-NEXT: movaps %xmm10, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] -; SSE-NEXT: movaps %xmm9, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; SSE-NEXT: movaps %xmm12, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm15[1] -; SSE-NEXT: movaps 80(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps 240(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: movaps 208(%rdi), %xmm12 +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: movaps %xmm10, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm11[1] +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; SSE-NEXT: movaps %xmm11, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 80(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: movaps 16(%rdi), %xmm0 ; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm15[1] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm0, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; SSE-NEXT: movaps %xmm14, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: movaps %xmm9, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; SSE-NEXT: movaps %xmm1, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) ; SSE-NEXT: movaps %xmm7, (%rdx) -; SSE-NEXT: movaps %xmm6, 16(%rdx) -; SSE-NEXT: movaps %xmm14, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm11, (%r8) -; SSE-NEXT: movaps %xmm5, 16(%r8) +; SSE-NEXT: movaps %xmm5, 16(%rdx) +; SSE-NEXT: movaps %xmm8, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps %xmm10, (%r8) +; SSE-NEXT: movaps %xmm6, 16(%r8) ; SSE-NEXT: movaps %xmm4, (%r9) ; SSE-NEXT: movaps %xmm13, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, (%rax) -; SSE-NEXT: movaps %xmm12, 16(%rax) +; SSE-NEXT: movaps %xmm14, (%rax) +; SSE-NEXT: movaps %xmm11, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm10, (%rax) -; SSE-NEXT: movaps %xmm3, 16(%rax) +; SSE-NEXT: movaps %xmm3, (%rax) +; SSE-NEXT: movaps %xmm1, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm9, 16(%rax) -; SSE-NEXT: movaps %xmm1, (%rax) +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride8_vf8: @@ -590,87 +589,87 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm8[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm10[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm8[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm10 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm15[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,0],ymm12[4,5],ymm8[6,4] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm6[1,0],ymm7[1,0],ymm6[5,4],ymm7[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm14[2,0],ymm8[2,3],ymm14[6,4],ymm8[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm6[1,0],ymm7[1,0],ymm6[5,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[2,0],ymm8[2,3],ymm13[6,4],ymm8[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,0],ymm15[4,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,0],ymm15[4,5],ymm13[6,4] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm7[3,0],ymm6[7,4],ymm7[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm10[2,3],ymm6[6,4],ymm10[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] @@ -683,7 +682,7 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm9, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm12, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -698,7 +697,6 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-LABEL: load_i32_stride8_vf8: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 @@ -708,13 +706,13 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vbroadcastss %xmm8, %xmm5 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vbroadcastss %xmm9, %xmm6 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vbroadcastss %xmm10, %xmm6 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vbroadcastss %xmm10, %xmm5 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vbroadcastss %xmm9, %xmm5 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm13 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm12 @@ -723,11 +721,11 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] @@ -738,71 +736,70 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm8[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm9[0,1,2],xmm11[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm10[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm12[0,1,2],xmm14[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0,1,2],xmm13[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm15[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm13[1],xmm9[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm9[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm13[0],ymm9[2],ymm13[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm14[5],ymm10[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm2[1],ymm13[2,3,4],ymm2[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm15[0],ymm9[2],ymm15[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm8, (%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm9, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX2-ONLY-NEXT: vzeroupper @@ -899,96 +896,94 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride8_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $296, %rsp # imm = 0x128 -; SSE-NEXT: movaps 288(%rdi), %xmm1 -; SSE-NEXT: movaps 352(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm7 -; SSE-NEXT: movaps 416(%rdi), %xmm0 +; SSE-NEXT: movaps 288(%rdi), %xmm6 +; SSE-NEXT: movaps 352(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 480(%rdi), %xmm11 -; SSE-NEXT: movaps 448(%rdi), %xmm3 -; SSE-NEXT: movaps 160(%rdi), %xmm8 -; SSE-NEXT: movaps 128(%rdi), %xmm15 -; SSE-NEXT: movaps 224(%rdi), %xmm5 -; SSE-NEXT: movaps 192(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: movaps %xmm15, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: movaps %xmm9, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm6[0] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 320(%rdi), %xmm5 +; SSE-NEXT: movaps 416(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 384(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 480(%rdi), %xmm13 +; SSE-NEXT: movaps 448(%rdi), %xmm4 +; SSE-NEXT: movaps 160(%rdi), %xmm7 +; SSE-NEXT: movaps 128(%rdi), %xmm10 +; SSE-NEXT: movaps 224(%rdi), %xmm8 +; SSE-NEXT: movaps 192(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] -; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; SSE-NEXT: movaps %xmm10, %xmm11 +; SSE-NEXT: movaps %xmm10, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; SSE-NEXT: movaps %xmm11, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm9[1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm9[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: movaps 256(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movaps 256(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm6 -; SSE-NEXT: movaps 64(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] +; SSE-NEXT: movaps 96(%rdi), %xmm10 +; SSE-NEXT: movaps 64(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; SSE-NEXT: movaps (%rdi), %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm14 +; SSE-NEXT: movaps 32(%rdi), %xmm12 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm10[1] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm3[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm3[1] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm7[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm5[1] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm12[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 208(%rdi), %xmm15 ; SSE-NEXT: movaps %xmm15, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1002,70 +997,70 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 496(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 464(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 432(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 400(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] -; SSE-NEXT: movaps 368(%rdi), %xmm12 +; SSE-NEXT: movaps 496(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 464(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 432(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 400(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 368(%rdi), %xmm14 ; SSE-NEXT: movaps 336(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] -; SSE-NEXT: movaps 304(%rdi), %xmm11 -; SSE-NEXT: movaps 272(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] -; SSE-NEXT: movaps 112(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movaps 304(%rdi), %xmm12 +; SSE-NEXT: movaps 272(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps 112(%rdi), %xmm13 ; SSE-NEXT: movaps 80(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movaps 16(%rdi), %xmm6 -; SSE-NEXT: movaps 48(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps 48(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] +; SSE-NEXT: movaps %xmm3, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: unpckhps (%rsp), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm15[1] -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: movaps %xmm13, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] -; SSE-NEXT: movaps %xmm10, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: movaps %xmm0, %xmm12 +; SSE-NEXT: movaps %xmm7, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] +; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1102,271 +1097,273 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%r9) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm7, 32(%rax) -; SSE-NEXT: movaps %xmm8, 48(%rax) -; SSE-NEXT: movaps %xmm5, (%rax) +; SSE-NEXT: movaps %xmm4, 32(%rax) +; SSE-NEXT: movaps %xmm10, 48(%rax) +; SSE-NEXT: movaps %xmm3, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 48(%rax) -; SSE-NEXT: movaps %xmm4, 32(%rax) -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm3, (%rax) +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps %xmm9, 32(%rax) +; SSE-NEXT: movaps %xmm11, 16(%rax) +; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm10, 48(%rax) -; SSE-NEXT: movaps %xmm13, 32(%rax) -; SSE-NEXT: movaps %xmm9, 16(%rax) -; SSE-NEXT: movaps %xmm6, (%rax) +; SSE-NEXT: movaps %xmm6, 48(%rax) +; SSE-NEXT: movaps %xmm7, 32(%rax) +; SSE-NEXT: movaps %xmm12, 16(%rax) +; SSE-NEXT: movaps %xmm8, (%rax) ; SSE-NEXT: addq $296, %rsp # imm = 0x128 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride8_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $584, %rsp # imm = 0x248 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: subq $616, %rsp # imm = 0x268 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm15[2],xmm7[3],xmm15[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm13[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[4],ymm6[4],ymm8[5],ymm6[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[4],ymm7[4],ymm8[5],ymm7[5] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[4],ymm13[4],ymm14[5],ymm13[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm5[1,0],ymm6[5,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[4],ymm9[4],ymm10[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[4],ymm9[4],ymm4[5],ymm9[5] +; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,0],ymm6[1,0],ymm9[5,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[1,0],ymm2[1,0],ymm0[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm15[0],ymm11[0],ymm15[1],ymm11[1],ymm15[4],ymm11[4],ymm15[5],ymm11[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm8[1,0],ymm7[5,4],ymm8[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[2,0],ymm6[2,3],ymm1[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,0],ymm1[1,0],ymm0[5,4],ymm1[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,0],ymm14[1,0],ymm13[5,4],ymm14[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[4],ymm6[4],ymm3[5],ymm6[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,0],ymm5[1,0],ymm7[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm2[2,0],ymm15[2,3],ymm2[6,4],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm5[0],ymm14[0],ymm5[1],ymm14[1],ymm5[4],ymm14[4],ymm5[5],ymm14[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,0],ymm13[1,0],ymm12[5,4],ymm13[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,1],ymm7[2,0],ymm1[4,5],ymm7[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm15[1],ymm11[3],ymm15[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,1],ymm7[2,0],ymm2[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm3[1],ymm6[3],ymm3[3] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm9 +; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm11[2],ymm3[3],ymm11[3],ymm3[6],ymm11[6],ymm3[7],ymm11[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm14[1],ymm5[1],ymm14[3],ymm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[3,0],mem[3,0],ymm4[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[3,0],mem[3,0],ymm5[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm11[2],ymm15[3],ymm11[3],ymm15[6],ymm11[6],ymm15[7],ymm11[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[3,0],ymm3[3,0],ymm2[7,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,0],ymm1[2,3],ymm3[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,0],ymm14[3,0],ymm13[7,4],ymm14[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm4[3,0],ymm2[7,4],ymm4[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[6],ymm7[6],ymm10[7],ymm7[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm4[3,0],mem[3,0],ymm4[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,0],ymm3[3,0],ymm1[7,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm2[2,3],ymm6[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm5[2],ymm14[2],ymm5[3],ymm14[3],ymm5[6],ymm14[6],ymm5[7],ymm14[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,0],ymm13[3,0],ymm12[7,4],ymm13[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -1393,254 +1390,255 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX1-ONLY-NEXT: addq $584, %rsp # imm = 0x248 +; AVX1-ONLY-NEXT: addq $616, %rsp # imm = 0x268 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride8_vf16: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm15 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps %xmm1, %xmm8 ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm2 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm9 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm3 +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm3 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps %xmm1, %xmm5 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm12 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vbroadcastss %xmm13, %xmm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm3 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm10 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps %xmm0, %xmm6 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm0, %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm11 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm0, %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vbroadcastss %xmm13, %xmm3 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps %xmm15, %xmm10 -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps %xmm8, %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm9[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm15[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm2[0,1,2],xmm14[3] -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovaps %xmm8, %xmm7 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm15[2],mem[2],xmm15[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm14 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[2,2,2,2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm15[0],ymm6[0],ymm15[1],ymm6[1],ymm15[4],ymm6[4],ymm15[5],ymm6[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[4],ymm4[4],ymm12[5],ymm4[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[4],ymm7[4],ymm13[5],ymm7[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm13[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm14[0],ymm7[0],ymm14[1],ymm7[1],ymm14[4],ymm7[4],ymm14[5],ymm7[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm8[0],ymm0[0],ymm8[1],ymm0[1],ymm8[4],ymm0[4],ymm8[5],ymm0[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm2 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm7[5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm2 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm6[2],ymm15[3],ymm6[3],ymm15[6],ymm6[6],ymm15[7],ymm6[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[6],ymm5[6],ymm11[7],ymm5[7] +; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm1 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 504(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm14[2],ymm7[2],ymm14[3],ymm7[3],ymm14[6],ymm7[6],ymm14[7],ymm7[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm7[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhps (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm8 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 476(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 476(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm4 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 @@ -1662,12 +1660,12 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm13, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -1959,49 +1957,50 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride8_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $952, %rsp # imm = 0x3B8 -; SSE-NEXT: movaps 544(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 608(%rdi), %xmm5 +; SSE-NEXT: movaps 544(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 576(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 672(%rdi), %xmm6 +; SSE-NEXT: movaps 608(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 640(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 736(%rdi), %xmm7 +; SSE-NEXT: movaps 576(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 704(%rdi), %xmm10 +; SSE-NEXT: movaps 672(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 640(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 736(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 704(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] +; SSE-NEXT: movaps 128(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; SSE-NEXT: movaps 224(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movaps 512(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2012,11 +2011,12 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 448(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 416(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 416(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 384(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2024,179 +2024,177 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 992(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 960(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps 960(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 928(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 896(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: movaps 896(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 352(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 320(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movaps 288(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 288(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm11[0] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 864(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 864(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 832(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps 800(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 768(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm13[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm15 -; SSE-NEXT: movaps 64(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; SSE-NEXT: movaps (%rdi), %xmm6 -; SSE-NEXT: movaps 32(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm13[0] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm6 +; SSE-NEXT: movaps 64(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] +; SSE-NEXT: movaps (%rdi), %xmm8 +; SSE-NEXT: movaps 32(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm13[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: unpckhps (%rsp), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: unpckhps (%rsp), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm14[2],xmm6[3],xmm14[3] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm8[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm13[1] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps 176(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm3, %xmm13 ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 336(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 304(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 272(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movaps 368(%rdi), %xmm4 +; SSE-NEXT: movaps 336(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movaps 304(%rdi), %xmm5 +; SSE-NEXT: movaps 272(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 496(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 464(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps 496(%rdi), %xmm7 +; SSE-NEXT: movaps 464(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movaps 432(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 400(%rdi), %xmm1 @@ -2207,16 +2205,16 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 624(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 592(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 560(%rdi), %xmm2 +; SSE-NEXT: movaps 624(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 528(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 592(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps 560(%rdi), %xmm6 +; SSE-NEXT: movaps 528(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2224,13 +2222,13 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 752(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 720(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps 720(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 688(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 656(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movaps 656(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] @@ -2239,116 +2237,118 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 880(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 848(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps 848(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 816(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 784(%rdi), %xmm1 +; SSE-NEXT: movaps 816(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 784(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1008(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 976(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps 976(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 944(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 912(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps 912(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps 80(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 16(%rdi), %xmm12 -; SSE-NEXT: movaps 48(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 16(%rdi), %xmm15 +; SSE-NEXT: movaps 48(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm4[2],xmm13[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm14[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm14[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm8[1] -; SSE-NEXT: movaps %xmm13, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1] -; SSE-NEXT: movaps %xmm11, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] +; SSE-NEXT: movaps %xmm5, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movaps %xmm10, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] -; SSE-NEXT: movaps %xmm12, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1] +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm10[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -2381,7 +2381,7 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rcx) @@ -2430,7 +2430,8 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm15, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -2439,50 +2440,55 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm5, 112(%rax) -; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps %xmm7, 80(%rax) -; SSE-NEXT: movaps %xmm8, 64(%rax) -; SSE-NEXT: movaps %xmm9, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm4, (%rax) +; SSE-NEXT: movaps %xmm3, 112(%rax) +; SSE-NEXT: movaps %xmm8, 96(%rax) +; SSE-NEXT: movaps %xmm4, 80(%rax) +; SSE-NEXT: movaps %xmm13, 64(%rax) +; SSE-NEXT: movaps %xmm12, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm10, 112(%rax) +; SSE-NEXT: movaps %xmm2, 112(%rax) ; SSE-NEXT: movaps %xmm6, 96(%rax) -; SSE-NEXT: movaps %xmm11, 80(%rax) -; SSE-NEXT: movaps %xmm13, 64(%rax) -; SSE-NEXT: movaps %xmm14, 48(%rax) -; SSE-NEXT: movaps %xmm3, 32(%rax) +; SSE-NEXT: movaps %xmm5, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movaps %xmm7, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm12, (%rax) +; SSE-NEXT: movaps %xmm15, (%rax) ; SSE-NEXT: addq $952, %rsp # imm = 0x3B8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1736, %rsp # imm = 0x6C8 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] +; AVX1-ONLY-NEXT: subq $1800, %rsp # imm = 0x708 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -2491,39 +2497,37 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 @@ -2541,242 +2545,241 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm13[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm3[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm8[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm14[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm12[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm9[2],xmm14[3],xmm9[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm15[2],xmm9[3],xmm15[3] +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm9[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm8[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm1[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vpermilps $238, (%rsp), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm0[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm10[2],xmm5[3],xmm10[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $238, (%rsp), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm8[0],ymm3[1],ymm8[1],ymm3[4],ymm8[4],ymm3[5],ymm8[5] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] -; AVX1-ONLY-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm14 +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm11 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -2787,55 +2790,56 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm12 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm6 ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2845,153 +2849,160 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1],ymm12[2,0],ymm13[4,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm3[2,0],ymm6[4,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm14[0],ymm2[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[4],ymm15[4],ymm2[5],ymm15[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[4],ymm3[4],ymm6[5],ymm3[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm12[1,0],ymm8[5,4],ymm12[5,4] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[4],ymm12[4],ymm9[5],ymm12[5] +; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm5[1,0],ymm13[5,4],ymm5[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[1],ymm14[1],ymm1[4],ymm14[4],ymm1[5],ymm14[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[1,0],ymm6[1,0],ymm7[5,4],ymm6[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,0],ymm5[1,0],ymm3[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,0],ymm8[1,0],ymm14[5,4],ymm8[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[4],mem[4],ymm4[5],mem[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm11[0],ymm7[1],ymm11[1],ymm7[4],ymm11[4],ymm7[5],ymm11[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[1,0],ymm9[1,0],ymm7[5,4],ymm9[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm9[1,0],ymm1[5,4],ymm9[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[1,0],ymm4[1,0],ymm6[5,4],ymm4[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[4],ymm10[4],ymm0[5],ymm10[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm10[1,0],ymm6[5,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,0],ymm2[1,0],ymm5[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[4],mem[4],ymm14[5],mem[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,0],ymm2[1,0],ymm14[5,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps (%rsp), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm3[1],ymm12[1],ymm3[3],ymm12[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm14[2],ymm8[3],ymm14[3],ymm8[6],ymm14[6],ymm8[7],ymm14[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[6],ymm7[6],ymm9[7],ymm7[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[6],ymm7[6],ymm3[7],ymm7[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[6],ymm5[6],ymm2[7],ymm5[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm10[2],ymm4[3],ymm10[3],ymm4[6],ymm10[6],ymm4[7],ymm10[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm10[2],ymm13[3],ymm10[3],ymm13[6],ymm10[6],ymm13[7],ymm10[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm9[2],ymm15[2],ymm9[3],ymm15[3],ymm9[6],ymm15[6],ymm9[7],ymm15[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[6],ymm14[6],ymm2[7],ymm14[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] @@ -2999,31 +3010,30 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,0],mem[3,0],ymm11[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[6],ymm6[6],ymm8[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,0],ymm3[3,0],ymm7[7,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,0],ymm6[3,0],ymm8[7,4],ymm6[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhps (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -3033,24 +3043,25 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm14[2],mem[2],ymm14[3],mem[3],ymm14[6],mem[6],ymm14[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm10[3,0],ymm4[3,0],ymm10[7,4],ymm4[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,3],ymm3[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[3,0],ymm13[3,0],ymm10[7,4],ymm13[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm2[2,3],ymm4[6,4],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm15[3,0],ymm9[3,0],ymm15[7,4],ymm9[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[6],ymm9[6],ymm3[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) @@ -3104,7 +3115,8 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -3112,40 +3124,39 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) -; AVX1-ONLY-NEXT: addq $1736, %rsp # imm = 0x6C8 +; AVX1-ONLY-NEXT: addq $1800, %rsp # imm = 0x708 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride8_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1576, %rsp # imm = 0x628 +; AVX2-ONLY-NEXT: subq $1544, %rsp # imm = 0x608 ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm11 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm10 ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm13 +; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm14 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm10 +; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm15 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm15 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm3 @@ -3162,17 +3173,17 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3192,7 +3203,7 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -3217,46 +3228,46 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vbroadcastss %xmm8, %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm14 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps %xmm10, %xmm3 ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -3265,29 +3276,30 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm12[0],mem[0],xmm12[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm8[0],mem[0],xmm8[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -3295,15 +3307,15 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,1,1] @@ -3313,147 +3325,147 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm15 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm12[0,1,2],xmm14[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm10[2,2,2,2] +; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1,2],xmm14[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm4[0,1,2],xmm8[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm12[0,1,2],xmm15[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm3[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[0,1,2],xmm15[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm14[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm1[1] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm12[2],mem[2],xmm12[3],mem[3] +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm15[2],xmm4[3],xmm15[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm1[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm9 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[2,2,2,2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 @@ -3466,21 +3478,22 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[2,2,2,2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3492,95 +3505,88 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[4],ymm0[4],ymm13[5],ymm0[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm10 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm15[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[4],ymm0[4],ymm3[5],ymm0[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm5 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 916(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm1 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm4 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm3[1],ymm10[2,3,4],ymm3[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 916(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 660(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovaps %ymm13, %ymm15 -; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -3589,2453 +3595,2410 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[6],ymm10[6],ymm0[7],ymm10[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm11[2],ymm3[2],ymm11[3],ymm3[3],ymm11[6],ymm3[6],ymm11[7],ymm3[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm2 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 1016(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm6[2],ymm1[3],ymm6[3],ymm1[6],ymm6[6],ymm1[7],ymm6[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm3 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm2 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm10[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm8 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 760(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[6],ymm7[6],ymm5[7],ymm7[7] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm15 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm14[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm5 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm15 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm4 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 476(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 732(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm10[1],ymm4[1],ymm10[3],ymm4[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm13[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 988(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%r9) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 476(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm1 +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 732(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 988(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm3 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rax) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX2-ONLY-NEXT: addq $1576, %rsp # imm = 0x628 +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) +; AVX2-ONLY-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i32_stride8_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: movb $-64, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] ; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] ; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] -; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] ; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] -; AVX512F-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] -; AVX512F-ONLY-SLOW-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 -; AVX512F-ONLY-SLOW-NEXT: vzeroupper +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] +; AVX512F-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] +; AVX512F-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i32_stride8_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: movb $-64, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] ; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512F-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] ; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] -; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] ; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] -; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] -; AVX512F-ONLY-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] +; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512F-ONLY-FAST-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: load_i32_stride8_vf32: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: movb $-64, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] ; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] ; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] ; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] -; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] ; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] -; AVX512DQ-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] -; AVX512DQ-SLOW-NEXT: # ymm18 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] +; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] +; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] +; AVX512DQ-SLOW-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512DQ-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i32_stride8_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: movb $-64, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] ; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] ; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] ; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] -; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] ; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] -; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] -; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm7, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm7, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] +; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] +; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512DQ-FAST-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: load_i32_stride8_vf32: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] ; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] ; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] -; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] -; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] ; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-SLOW-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512BW-ONLY-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: load_i32_stride8_vf32: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] ; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] ; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] -; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] -; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] ; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, (%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, (%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512BW-ONLY-FAST-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: load_i32_stride8_vf32: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: movb $-64, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] ; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] ; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] ; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] -; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] -; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] ; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] -; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] -; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] -; AVX512DQBW-SLOW-NEXT: # ymm18 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, (%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, (%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] +; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] +; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] +; AVX512DQBW-SLOW-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQBW-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512DQBW-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: load_i32_stride8_vf32: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: movb $-64, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] ; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] ; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] ; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] -; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] -; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] ; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] -; AVX512DQBW-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] -; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] -; AVX512DQBW-FAST-NEXT: # ymm18 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, (%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, (%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] +; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] +; AVX512DQBW-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] +; AVX512DQBW-FAST-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQBW-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512DQBW-FAST-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <256 x i32>, ptr %in.vec, align 64 @@ -6062,16 +6025,16 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride8_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $2232, %rsp # imm = 0x8B8 -; SSE-NEXT: movaps 288(%rdi), %xmm6 +; SSE-NEXT: movaps 288(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 352(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps 320(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm7 +; SSE-NEXT: movaps 416(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 416(%rdi), %xmm8 +; SSE-NEXT: movaps 384(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 480(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 448(%rdi), %xmm3 @@ -6092,8 +6055,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 @@ -6101,21 +6064,21 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movaps 256(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movaps 256(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 736(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 736(%rdi), %xmm9 ; SSE-NEXT: movaps 704(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: movaps 672(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 640(%rdi), %xmm1 @@ -6126,16 +6089,17 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 608(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 576(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 544(%rdi), %xmm2 +; SSE-NEXT: movaps 608(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 512(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 576(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps 544(%rdi), %xmm15 +; SSE-NEXT: movaps 512(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6158,14 +6122,14 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 864(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 832(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movaps 832(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 800(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 768(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 800(%rdi), %xmm14 +; SSE-NEXT: movaps 768(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6173,8 +6137,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1248(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1216(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps 1216(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1184(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6186,11 +6150,12 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1120(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1088(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 1120(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1088(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps 1056(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1024(%rdi), %xmm1 @@ -6203,8 +6168,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1504(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1472(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps 1472(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1440(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6218,8 +6183,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1376(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1344(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps 1344(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1312(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6233,8 +6198,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1760(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1728(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps 1728(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1696(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6248,14 +6213,15 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1632(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1600(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps 1600(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 1568(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1536(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 1568(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1536(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6263,8 +6229,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 2016(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1984(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps 1984(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1952(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6278,236 +6244,239 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1888(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1856(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps 1856(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 1824(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1792(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm1, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1824(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1792(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movaps (%rdi), %xmm11 -; SSE-NEXT: movaps 32(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] -; SSE-NEXT: movaps %xmm10, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps (%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: movaps %xmm10, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps (%rsp), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movaps (%rsp), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm14[2],xmm8[3],xmm14[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm11[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm12[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm15[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm14[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm14[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm14[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm8[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm13[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm15[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm14 -; SSE-NEXT: movaps 208(%rdi), %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 240(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps 176(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdi), %xmm1 @@ -6519,70 +6488,64 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 368(%rdi), %xmm3 ; SSE-NEXT: movaps 336(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 304(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps 304(%rdi), %xmm4 ; SSE-NEXT: movaps 272(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 496(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 496(%rdi), %xmm5 ; SSE-NEXT: movaps 464(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 432(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movaps 432(%rdi), %xmm6 ; SSE-NEXT: movaps 400(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 624(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 624(%rdi), %xmm9 ; SSE-NEXT: movaps 592(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 560(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movaps 560(%rdi), %xmm10 ; SSE-NEXT: movaps 528(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 752(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 752(%rdi), %xmm12 ; SSE-NEXT: movaps 720(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 688(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 656(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movaps 688(%rdi), %xmm13 +; SSE-NEXT: movaps 656(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 880(%rdi), %xmm12 +; SSE-NEXT: movaps 880(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 848(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 816(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 784(%rdi), %xmm1 @@ -6593,11 +6556,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1008(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 976(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 1008(%rdi), %xmm14 +; SSE-NEXT: movaps 976(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movaps 944(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 912(%rdi), %xmm1 @@ -6610,8 +6572,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1136(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1104(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps 1104(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1072(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6623,11 +6585,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1264(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1232(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 1264(%rdi), %xmm11 +; SSE-NEXT: movaps 1232(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movaps 1200(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1168(%rdi), %xmm1 @@ -6640,8 +6601,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1392(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1360(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps 1360(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1328(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6655,8 +6616,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1520(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1488(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps 1488(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1456(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6670,24 +6631,23 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1648(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1616(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps 1616(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1584(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1552(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1776(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1744(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 1776(%rdi), %xmm8 +; SSE-NEXT: movaps 1744(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movaps 1712(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1680(%rdi), %xmm1 @@ -6700,8 +6660,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1904(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1872(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps 1872(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1840(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6715,225 +6675,234 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 2032(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 2000(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps 2000(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 1968(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1968(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1936(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movaps %xmm1, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movaps 16(%rdi), %xmm13 -; SSE-NEXT: movaps 48(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; SSE-NEXT: movaps %xmm11, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; SSE-NEXT: movaps %xmm15, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm14[2],xmm3[3],xmm14[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps (%rsp), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movaps %xmm14, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm12 +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm11[1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm10[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm15[1] -; SSE-NEXT: movaps %xmm12, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm9[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm8[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] -; SSE-NEXT: movaps %xmm13, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movaps (%rsp), %xmm7 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0] +; SSE-NEXT: movaps %xmm2, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm13[0] +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: movaps %xmm0, %xmm12 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, %xmm9 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 240(%rsi) @@ -7124,44 +7093,43 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm3, 240(%rax) ; SSE-NEXT: movaps %xmm4, 224(%rax) ; SSE-NEXT: movaps %xmm5, 208(%rax) -; SSE-NEXT: movaps %xmm7, 192(%rax) -; SSE-NEXT: movaps %xmm12, 176(%rax) -; SSE-NEXT: movaps %xmm8, 160(%rax) -; SSE-NEXT: movaps %xmm9, 144(%rax) -; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps %xmm11, 112(%rax) -; SSE-NEXT: movaps %xmm14, 96(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm6, (%rax) +; SSE-NEXT: movaps %xmm2, 192(%rax) +; SSE-NEXT: movaps %xmm6, 176(%rax) +; SSE-NEXT: movaps %xmm7, 160(%rax) +; SSE-NEXT: movaps %xmm13, 144(%rax) +; SSE-NEXT: movaps %xmm14, 128(%rax) +; SSE-NEXT: movaps %xmm15, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm10, 240(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rax) +; SSE-NEXT: movaps %xmm9, 240(%rax) +; SSE-NEXT: movaps %xmm8, 224(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rax) +; SSE-NEXT: movaps %xmm10, 192(%rax) +; SSE-NEXT: movaps %xmm12, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps %xmm15, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rax) +; SSE-NEXT: movaps %xmm0, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) @@ -7171,18 +7139,18 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm13, (%rax) +; SSE-NEXT: movaps %xmm11, (%rax) ; SSE-NEXT: addq $2232, %rsp # imm = 0x8B8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride8_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3688, %rsp # imm = 0xE68 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: subq $3720, %rsp # imm = 0xE88 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 @@ -7190,18 +7158,19 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -7223,9 +7192,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm3 @@ -7235,11 +7204,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7250,24 +7219,25 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7280,9 +7250,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %xmm3 @@ -7291,11 +7261,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7306,18 +7276,17 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 @@ -7332,92 +7301,105 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm15, %xmm11 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm6[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm14[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] @@ -7425,99 +7407,85 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm11[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[1],xmm13[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm15[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm10[1],xmm6[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpermilps $85, (%rsp), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, (%rsp), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] @@ -7538,8 +7506,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] @@ -7548,10 +7515,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7559,8 +7526,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm15[2],mem[2],xmm15[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] @@ -7577,11 +7545,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] @@ -7590,19 +7557,19 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] @@ -7613,146 +7580,147 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm1[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm2[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm8[2],mem[2],xmm8[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm7[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm8[2],mem[2],xmm8[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm9[2],mem[2],xmm9[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm0[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm2[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm0[0,1],xmm11[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm4[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm2[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm15[2],mem[2],xmm15[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7765,11 +7733,12 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] @@ -7777,10 +7746,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm13[2],mem[2],xmm13[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] @@ -7791,50 +7761,50 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm4 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm15 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[4],ymm6[4],ymm1[5],ymm6[5] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm12[0],ymm2[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[4],ymm1[4],ymm11[5],ymm1[5] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm13 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[4],ymm12[4],ymm8[5],ymm12[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5] -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],ymm11[0],ymm7[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] @@ -7851,11 +7821,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7869,14 +7839,14 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7905,664 +7875,668 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[4],ymm2[4],ymm13[5],ymm2[5] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm7[0,1],ymm0[2,0],ymm7[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm1[2,0],ymm8[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[4],mem[4],ymm12[5],mem[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm11[1,0],ymm1[5,4],ymm11[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[4],ymm3[4],ymm9[5],ymm3[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[1,0],ymm5[1,0],ymm1[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm15[1,0],ymm1[5,4],ymm15[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm15 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm15[0],ymm9[1],ymm15[1],ymm9[4],ymm15[4],ymm9[5],ymm15[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm6[1,0],ymm2[1,0],ymm6[5,4],ymm2[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm6[0],ymm13[1],ymm6[1],ymm13[4],ymm6[4],ymm13[5],ymm6[5] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,0],ymm8[1,0],ymm12[5,4],ymm8[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm13 +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm7[1,0],ymm11[1,0],ymm7[5,4],ymm11[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm5[1,0],mem[1,0],ymm5[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm12[0],ymm5[0],ymm12[1],ymm5[1],ymm12[4],ymm5[4],ymm12[5],ymm5[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,0],ymm7[1,0],ymm10[5,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm13[1,0],ymm1[5,4],ymm13[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[4],ymm13[4],ymm15[5],ymm13[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,0],mem[1,0],ymm0[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[1,0],mem[1,0],ymm8[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[0],mem[0],ymm8[1],mem[1],ymm8[4],mem[4],ymm8[5],mem[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,0],mem[1,0],ymm0[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[1,0],mem[1,0],ymm8[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[0],mem[0],ymm8[1],mem[1],ymm8[4],mem[4],ymm8[5],mem[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,0],ymm8[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm10[1],ymm6[1],ymm10[3],ymm6[3] +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm15[1],ymm9[1],ymm15[3],ymm9[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,0],ymm8[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm1[1],ymm13[1],ymm1[3],ymm13[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,0],ymm8[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,0],ymm8[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[6],ymm7[6],ymm9[7],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,0],ymm8[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm9[1],ymm2[1],ymm9[3],ymm2[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,0],ymm8[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[6],ymm7[6],ymm9[7],ymm7[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,0],ymm8[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps (%rsp), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[6],ymm13[6],ymm11[7],ymm13[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,0],ymm8[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[2],mem[2],ymm14[3],mem[3],ymm14[6],mem[6],ymm14[7],mem[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,0],mem[3,0],ymm10[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,0],mem[3,0],ymm10[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[6],ymm1[6],ymm10[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm12[3,0],mem[3,0],ymm12[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm5[3,0],mem[3,0],ymm5[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm11[3,0],ymm13[3,0],ymm11[7,4],ymm13[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm4[3,0],mem[3,0],ymm4[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm7[3,0],ymm9[3,0],ymm7[7,4],ymm9[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[3,0],ymm15[3,0],ymm1[7,4],ymm15[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,0],ymm0[2,3],ymm14[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhps (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[3,0],ymm9[3,0],ymm7[7,4],ymm9[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[3,0],ymm11[3,0],ymm13[7,4],ymm11[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0],ymm15[3,0],ymm9[7,4],ymm15[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rax) -; AVX1-ONLY-NEXT: addq $3688, %rsp # imm = 0xE68 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: addq $3720, %rsp # imm = 0xE88 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride8_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3464, %rsp # imm = 0xD88 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 +; AVX2-ONLY-NEXT: subq $3528, %rsp # imm = 0xDC8 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -8572,9 +8546,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm13 ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm3 @@ -8588,19 +8562,18 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm1 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm2 @@ -8622,9 +8595,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8652,9 +8625,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8693,11 +8666,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -8710,11 +8683,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8740,11 +8713,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8753,56 +8726,57 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vbroadcastss %xmm8, %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vbroadcastss %xmm10, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm14 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps %xmm13, %xmm9 +; AVX2-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8816,12 +8790,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8831,16 +8806,17 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[1,1,1,1] +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8850,16 +8826,17 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8869,29 +8846,32 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8905,13 +8885,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm10[0],mem[0],xmm10[1],mem[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8925,13 +8905,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm9[0],mem[0],xmm9[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8942,145 +8922,145 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm9[0,1,2],xmm15[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm5[0,1],xmm15[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -9104,7 +9084,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -9120,10 +9100,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm12[1] -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm12 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 @@ -9133,11 +9113,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm5[1],xmm11[1] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm11[1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm11 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm11 = xmm5[2],mem[2],xmm5[3],mem[3] @@ -9151,8 +9129,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm5[1],xmm9[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm9[1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3] @@ -9163,12 +9140,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm3[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm6 = mem[2,3,2,3] @@ -9176,10 +9154,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm5 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 @@ -9189,8 +9167,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -9216,12 +9193,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm13 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm12 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9233,16 +9209,16 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm10 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] +; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 @@ -9261,22 +9237,22 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9288,22 +9264,22 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9315,22 +9291,22 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9340,199 +9316,185 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[4],ymm1[4],ymm11[5],ymm1[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[4],ymm1[4],ymm7[5],ymm1[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[4],ymm1[4],ymm6[5],ymm1[5] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm9 +; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[4],ymm9[4],ymm1[5],ymm9[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[2],ymm6[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm11 +; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[4],ymm0[4],ymm11[5],ymm0[5] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm14 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovaps %ymm12, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm14 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[0,1,2,3,4],mem[5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm13 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 660(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 660(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3,4],ymm14[5],ymm8[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 916(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 916(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm15[1],ymm7[2,3,4],ymm15[5],ymm7[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 1172(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 1428(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX2-ONLY-NEXT: vpermilps $85, (%rsp), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 1684(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 1172(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 1428(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 1940(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vbroadcastss 1684(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -9545,15 +9507,27 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 1940(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm6[2],ymm1[3],ymm6[3],ymm1[6],ymm6[6],ymm1[7],ymm6[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload @@ -9562,113 +9536,114 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 504(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm11[2],ymm1[3],ymm11[3],ymm1[6],ymm11[6],ymm1[7],ymm11[7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[6],ymm10[6],ymm15[7],ymm10[7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 760(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 1016(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm12[2,2,2,2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm14[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 1272(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm9[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm1 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 1528(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm1 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 1784(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm1 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 2040(%rdi), %ymm0 @@ -9678,17 +9653,17 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm0 +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] @@ -9732,5000 +9707,1402 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm1 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm1 +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 1244(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 1500(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 1756(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 2012(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 128(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 96(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm15, 64(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $3464, %rsp # imm = 0xD88 +; AVX2-ONLY-NEXT: addq $3528, %rsp # imm = 0xDC8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-ONLY-SLOW-LABEL: load_i32_stride8_vf64: -; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: movb $-64, %al -; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512F-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r9) -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8 -; AVX512F-ONLY-SLOW-NEXT: vzeroupper -; AVX512F-ONLY-SLOW-NEXT: retq -; -; AVX512F-ONLY-FAST-LABEL: load_i32_stride8_vf64: -; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: movb $-64, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512F-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r9) -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8 -; AVX512F-ONLY-FAST-NEXT: vzeroupper -; AVX512F-ONLY-FAST-NEXT: retq -; -; AVX512DQ-SLOW-LABEL: load_i32_stride8_vf64: -; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: movb $-64, %al -; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQ-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%r8) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%r9) -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rax) -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQ-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8 -; AVX512DQ-SLOW-NEXT: vzeroupper -; AVX512DQ-SLOW-NEXT: retq -; -; AVX512DQ-FAST-LABEL: load_i32_stride8_vf64: -; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512DQ-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: movb $-64, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%rsi) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%rdx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%r8) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%r9) -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rax) -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQ-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8 -; AVX512DQ-FAST-NEXT: vzeroupper -; AVX512DQ-FAST-NEXT: retq -; -; AVX512BW-ONLY-SLOW-LABEL: load_i32_stride8_vf64: -; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r9) -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8 -; AVX512BW-ONLY-SLOW-NEXT: vzeroupper -; AVX512BW-ONLY-SLOW-NEXT: retq -; -; AVX512BW-ONLY-FAST-LABEL: load_i32_stride8_vf64: -; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r9) -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8 -; AVX512BW-ONLY-FAST-NEXT: vzeroupper -; AVX512BW-ONLY-FAST-NEXT: retq -; -; AVX512DQBW-SLOW-LABEL: load_i32_stride8_vf64: -; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: movb $-64, %al -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQBW-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%rsi) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%rdx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%rcx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%r8) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%r9) -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rax) -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQBW-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8 -; AVX512DQBW-SLOW-NEXT: vzeroupper -; AVX512DQBW-SLOW-NEXT: retq +; AVX512F-LABEL: load_i32_stride8_vf64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: subq $3304, %rsp # imm = 0xCE8 +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm7 +; AVX512F-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 +; AVX512F-NEXT: movb $-64, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512F-NEXT: vmovdqu64 %zmm19, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm31 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm18, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm9 +; AVX512F-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm8 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm1, %zmm13 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm25, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm4 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm1, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm24 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm10, %zmm3, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm15 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm29 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm9, %zmm1, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm8, %zmm2, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,21,29,5,13,21,29] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm9, %zmm2, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm2, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm9 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm30 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm7, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm10, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm2, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm10, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm24, %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm7, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm10, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm2, %zmm19 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm7, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm10, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm2, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm7, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm22, %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm7, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm10, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm3, %zmm10, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm4, %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm10, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm3, %zmm2, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm25 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [5,13,21,29,5,13,21,29] +; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm14, %zmm1, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm21 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm7 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] +; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm11, %zmm9 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm22, %zmm3, %zmm2 +; AVX512F-NEXT: vpermi2d %zmm14, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm11, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm6 +; AVX512F-NEXT: vpblendd $15, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm3 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm20, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm23 {%k1} +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm4 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm23, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm7 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm26 {%k1} +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm8 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm26, %zmm8 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 192(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 192(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 192(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 192(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm25, 64(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%rax) +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-NEXT: addq $3304, %rsp # imm = 0xCE8 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq ; -; AVX512DQBW-FAST-LABEL: load_i32_stride8_vf64: -; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: movb $-64, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512DQBW-FAST-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%rsi) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%rdx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%rcx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%r8) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%r9) -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rax) -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQBW-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8 -; AVX512DQBW-FAST-NEXT: vzeroupper -; AVX512DQBW-FAST-NEXT: retq +; AVX512BW-LABEL: load_i32_stride8_vf64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: subq $3304, %rsp # imm = 0xCE8 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm7 +; AVX512BW-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: movb $-64, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm19, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm8 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm1, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm25, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm4 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm24 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm3, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm15 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm1, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm2, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm2, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm9 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm30 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm7, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm10, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm2, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm10, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm7, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm10, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm2, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm7, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm7, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm10, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm14, %zmm1, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm21 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] +; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm7 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] +; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm3, %zmm2 +; AVX512BW-NEXT: vpermi2d %zmm14, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm11, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm6 +; AVX512BW-NEXT: vpblendd $15, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm3 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm20, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm23 {%k1} +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm4 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm7 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 {%k1} +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm26, %zmm8 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%rax) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-NEXT: addq $3304, %rsp # imm = 0xCE8 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %wide.vec = load <512 x i32>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> %strided.vec1 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll index fb01a0ad31557..8fa1dd6a30b52 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll @@ -246,35 +246,35 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps (%rdi), %xmm1 ; SSE-NEXT: movaps 16(%rdi), %xmm8 ; SSE-NEXT: movaps 32(%rdi), %xmm0 -; SSE-NEXT: movaps 208(%rdi), %xmm9 +; SSE-NEXT: movaps 208(%rdi), %xmm11 ; SSE-NEXT: movaps 192(%rdi), %xmm2 ; SSE-NEXT: movaps 240(%rdi), %xmm10 ; SSE-NEXT: movaps 224(%rdi), %xmm4 -; SSE-NEXT: movaps 144(%rdi), %xmm12 +; SSE-NEXT: movaps 144(%rdi), %xmm14 ; SSE-NEXT: movaps 128(%rdi), %xmm3 -; SSE-NEXT: movaps 176(%rdi), %xmm13 +; SSE-NEXT: movaps 176(%rdi), %xmm12 ; SSE-NEXT: movaps 160(%rdi), %xmm6 -; SSE-NEXT: movaps 80(%rdi), %xmm14 +; SSE-NEXT: movaps 80(%rdi), %xmm13 ; SSE-NEXT: movaps 64(%rdi), %xmm5 ; SSE-NEXT: movaps 112(%rdi), %xmm15 ; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm5, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm14[1] -; SSE-NEXT: movaps %xmm7, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] +; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm15[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1] ; SSE-NEXT: movaps %xmm3, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1] -; SSE-NEXT: movaps %xmm6, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm13[1] -; SSE-NEXT: movaps %xmm2, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1] -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1] +; SSE-NEXT: movaps %xmm6, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1] +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] +; SSE-NEXT: movaps %xmm4, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm10[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] ; SSE-NEXT: movaps %xmm1, %xmm10 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0] @@ -284,12 +284,12 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] -; SSE-NEXT: movaps %xmm13, 96(%rsi) -; SSE-NEXT: movaps %xmm9, 112(%rsi) +; SSE-NEXT: movaps %xmm12, 96(%rsi) +; SSE-NEXT: movaps %xmm11, 112(%rsi) ; SSE-NEXT: movaps %xmm15, 64(%rsi) -; SSE-NEXT: movaps %xmm12, 80(%rsi) -; SSE-NEXT: movaps %xmm11, 32(%rsi) -; SSE-NEXT: movaps %xmm14, 48(%rsi) +; SSE-NEXT: movaps %xmm14, 80(%rsi) +; SSE-NEXT: movaps %xmm9, 32(%rsi) +; SSE-NEXT: movaps %xmm13, 48(%rsi) ; SSE-NEXT: movaps %xmm10, (%rsi) ; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps %xmm4, 112(%rdx) @@ -404,61 +404,61 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-LABEL: load_i64_stride2_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps 208(%rdi), %xmm7 -; SSE-NEXT: movaps 192(%rdi), %xmm1 -; SSE-NEXT: movaps 80(%rdi), %xmm10 -; SSE-NEXT: movaps 64(%rdi), %xmm0 -; SSE-NEXT: movaps 240(%rdi), %xmm11 -; SSE-NEXT: movaps 224(%rdi), %xmm3 -; SSE-NEXT: movaps 112(%rdi), %xmm13 -; SSE-NEXT: movaps 96(%rdi), %xmm2 -; SSE-NEXT: movaps 272(%rdi), %xmm9 -; SSE-NEXT: movaps 144(%rdi), %xmm14 -; SSE-NEXT: movaps 128(%rdi), %xmm4 -; SSE-NEXT: movaps 304(%rdi), %xmm12 -; SSE-NEXT: movaps 288(%rdi), %xmm6 -; SSE-NEXT: movaps 176(%rdi), %xmm15 -; SSE-NEXT: movaps 160(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm13[0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm15[1] +; SSE-NEXT: movaps 208(%rdi), %xmm11 +; SSE-NEXT: movaps 192(%rdi), %xmm6 +; SSE-NEXT: movaps 80(%rdi), %xmm1 +; SSE-NEXT: movaps 64(%rdi), %xmm5 +; SSE-NEXT: movaps 240(%rdi), %xmm14 +; SSE-NEXT: movaps 224(%rdi), %xmm8 +; SSE-NEXT: movaps 112(%rdi), %xmm3 +; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps 272(%rdi), %xmm12 +; SSE-NEXT: movaps 144(%rdi), %xmm2 +; SSE-NEXT: movaps 128(%rdi), %xmm9 +; SSE-NEXT: movaps 304(%rdi), %xmm0 +; SSE-NEXT: movaps 288(%rdi), %xmm13 +; SSE-NEXT: movaps 176(%rdi), %xmm4 +; SSE-NEXT: movaps 160(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm7, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm14[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm14[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm2[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm14[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm14[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm0 ; SSE-NEXT: movaps 352(%rdi), %xmm15 @@ -468,46 +468,46 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 336(%rdi), %xmm0 ; SSE-NEXT: movaps 320(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: movaps %xmm13, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 432(%rdi), %xmm0 -; SSE-NEXT: movaps 416(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: movaps 416(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 400(%rdi), %xmm0 -; SSE-NEXT: movaps 384(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: movaps 384(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm10 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps 496(%rdi), %xmm1 -; SSE-NEXT: movaps 480(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: movaps 496(%rdi), %xmm0 +; SSE-NEXT: movaps 480(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps 464(%rdi), %xmm1 +; SSE-NEXT: movaps 448(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movaps 32(%rdi), %xmm8 +; SSE-NEXT: movaps 48(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE-NEXT: movaps 464(%rdi), %xmm3 -; SSE-NEXT: movaps 448(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movaps 32(%rdi), %xmm11 -; SSE-NEXT: movaps 48(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm11, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] -; SSE-NEXT: movaps (%rdi), %xmm8 -; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm8, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] -; SSE-NEXT: movaps %xmm0, 224(%rsi) -; SSE-NEXT: movaps %xmm12, 160(%rsi) +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: movaps (%rdi), %xmm4 +; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, 224(%rsi) +; SSE-NEXT: movaps %xmm11, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps %xmm5, 240(%rsi) +; SSE-NEXT: movaps %xmm6, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -519,17 +519,17 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 128(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps %xmm7, (%rsi) +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps %xmm14, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps %xmm1, 224(%rdx) -; SSE-NEXT: movaps %xmm4, 240(%rdx) -; SSE-NEXT: movaps %xmm6, 192(%rdx) -; SSE-NEXT: movaps %xmm9, 208(%rdx) +; SSE-NEXT: movaps %xmm5, 16(%rsi) +; SSE-NEXT: movaps %xmm3, 224(%rdx) +; SSE-NEXT: movaps %xmm7, 240(%rdx) +; SSE-NEXT: movaps %xmm9, 192(%rdx) +; SSE-NEXT: movaps %xmm12, 208(%rdx) ; SSE-NEXT: movaps %xmm13, 160(%rdx) ; SSE-NEXT: movaps %xmm15, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -548,141 +548,141 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps %xmm8, (%rdx) -; SSE-NEXT: movaps %xmm11, 16(%rdx) +; SSE-NEXT: movaps %xmm4, (%rdx) +; SSE-NEXT: movaps %xmm8, 16(%rdx) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride2_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm3, %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm7[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm12[0],ymm7[2],ymm12[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm2, %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm3[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm14[0],ymm5[2],ymm14[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm6[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm12[1],ymm7[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm4[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm14[1],ymm5[3],ymm14[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm2[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm14[0],ymm2[2],ymm14[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm12[1],ymm4[3],ymm12[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm11[1],ymm6[3],ymm11[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm9[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm4[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm4[0],ymm15[0],ymm4[2],ymm15[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm15[1],ymm4[3],ymm15[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm13[1],ymm6[3],ymm13[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm11[0],ymm1[2],ymm11[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] -; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 64(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride2_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm2[0],ymm13[2],ymm2[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm8[0],ymm5[0],ymm8[2],ymm5[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm5[1],ymm8[3],ymm5[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm11[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm7[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm8[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm13[1],ymm2[1],ymm13[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm5[0],ymm15[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm15[1],ymm5[1],ymm15[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm14[0],ymm6[0],ymm14[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm14[1],ymm6[1],ymm14[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm8[0],ymm11[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm11[1],ymm8[1],ymm11[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm14[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm11[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -732,62 +732,62 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-LABEL: load_i64_stride2_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $664, %rsp # imm = 0x298 -; SSE-NEXT: movaps 304(%rdi), %xmm10 -; SSE-NEXT: movaps 272(%rdi), %xmm8 -; SSE-NEXT: movaps 256(%rdi), %xmm6 -; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 224(%rdi), %xmm7 -; SSE-NEXT: movaps 208(%rdi), %xmm1 -; SSE-NEXT: movaps 192(%rdi), %xmm9 -; SSE-NEXT: movaps 176(%rdi), %xmm2 +; SSE-NEXT: movaps 304(%rdi), %xmm0 +; SSE-NEXT: movaps 272(%rdi), %xmm1 +; SSE-NEXT: movaps 256(%rdi), %xmm8 +; SSE-NEXT: movaps 240(%rdi), %xmm2 +; SSE-NEXT: movaps 224(%rdi), %xmm9 +; SSE-NEXT: movaps 208(%rdi), %xmm3 +; SSE-NEXT: movaps 192(%rdi), %xmm10 +; SSE-NEXT: movaps 176(%rdi), %xmm4 ; SSE-NEXT: movaps 160(%rdi), %xmm11 -; SSE-NEXT: movaps 144(%rdi), %xmm3 +; SSE-NEXT: movaps 144(%rdi), %xmm5 ; SSE-NEXT: movaps 128(%rdi), %xmm12 -; SSE-NEXT: movaps 112(%rdi), %xmm4 +; SSE-NEXT: movaps 112(%rdi), %xmm6 ; SSE-NEXT: movaps 96(%rdi), %xmm13 -; SSE-NEXT: movaps 80(%rdi), %xmm5 +; SSE-NEXT: movaps 80(%rdi), %xmm7 ; SSE-NEXT: movaps 64(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm5[0] +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: movaps %xmm13, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm2[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 288(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 336(%rdi), %xmm0 ; SSE-NEXT: movaps 320(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -901,52 +901,52 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 848(%rdi), %xmm0 -; SSE-NEXT: movaps 832(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps 832(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 880(%rdi), %xmm0 -; SSE-NEXT: movaps 864(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movaps 912(%rdi), %xmm4 -; SSE-NEXT: movaps 896(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1] -; SSE-NEXT: movaps 944(%rdi), %xmm4 +; SSE-NEXT: movaps 864(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps 912(%rdi), %xmm0 +; SSE-NEXT: movaps 896(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 944(%rdi), %xmm0 ; SSE-NEXT: movaps 928(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] -; SSE-NEXT: movaps 976(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 976(%rdi), %xmm0 ; SSE-NEXT: movaps 960(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] -; SSE-NEXT: movaps 1008(%rdi), %xmm4 -; SSE-NEXT: movaps 992(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm0, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 1008(%rdi), %xmm3 +; SSE-NEXT: movaps 992(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-NEXT: movaps (%rdi), %xmm14 +; SSE-NEXT: movaps 16(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm14, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm3[1] +; SSE-NEXT: movaps 32(%rdi), %xmm11 ; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, 496(%rsi) -; SSE-NEXT: movaps %xmm3, 480(%rsi) -; SSE-NEXT: movaps %xmm5, 464(%rsi) -; SSE-NEXT: movaps %xmm7, 448(%rsi) -; SSE-NEXT: movaps %xmm11, 432(%rsi) -; SSE-NEXT: movaps %xmm10, 416(%rsi) +; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps %xmm4, 496(%rsi) +; SSE-NEXT: movaps %xmm1, 480(%rsi) +; SSE-NEXT: movaps %xmm2, 464(%rsi) +; SSE-NEXT: movaps %xmm9, 448(%rsi) +; SSE-NEXT: movaps %xmm7, 432(%rsi) +; SSE-NEXT: movaps %xmm12, 416(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 400(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -995,14 +995,15 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps %xmm4, 16(%rsi) -; SSE-NEXT: movaps %xmm14, (%rsi) -; SSE-NEXT: movaps %xmm2, 496(%rdx) +; SSE-NEXT: movaps %xmm3, 16(%rsi) +; SSE-NEXT: movaps %xmm10, (%rsi) +; SSE-NEXT: movaps %xmm5, 496(%rdx) ; SSE-NEXT: movaps %xmm6, 480(%rdx) ; SSE-NEXT: movaps %xmm8, 464(%rdx) -; SSE-NEXT: movaps %xmm9, 448(%rdx) -; SSE-NEXT: movaps %xmm12, 432(%rdx) -; SSE-NEXT: movaps %xmm15, 416(%rdx) +; SSE-NEXT: movaps %xmm13, 448(%rdx) +; SSE-NEXT: movaps %xmm15, 432(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 416(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 400(%rdx) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -1051,126 +1052,125 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) -; SSE-NEXT: movaps %xmm13, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm11, 16(%rdx) +; SSE-NEXT: movaps %xmm14, (%rdx) ; SSE-NEXT: addq $664, %rsp # imm = 0x298 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride2_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm11 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm1, %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm9, %ymm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm4, %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm2, %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm14[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm5, %ymm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm2, %ymm13 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm12[1],ymm6[1],ymm12[3],ymm6[3] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm3[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm3, %ymm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm9[0],ymm15[2],ymm9[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm9[1],ymm15[3],ymm9[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm4[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm3[0],ymm12[0],ymm3[2],ymm12[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm12[1],ymm3[3],ymm12[3] +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 800(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 992(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 256(%rsi) +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 800(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 992(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 384(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm13, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 480(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 480(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 416(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 352(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm15, 288(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) @@ -1181,14 +1181,14 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 480(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 384(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 416(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 320(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm14, 352(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) @@ -1213,167 +1213,167 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-ONLY-LABEL: load_i64_stride2_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] ; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovups %ymm12, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm11[0],ymm8[0],ymm11[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm11[1],ymm8[1],ymm11[3],ymm8[3] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm15[0],ymm12[2],ymm15[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm12[1],ymm15[1],ymm12[3],ymm15[3] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm10[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, (%rsp), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX2-ONLY-NEXT: vpermpd $216, (%rsp), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rsi) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rsi) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm11[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX2-ONLY-NEXT: addq $424, %rsp # imm = 0x1A8 ; AVX2-ONLY-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll index 88e8d2cff874c..ac1be65682a7b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll @@ -364,11 +364,11 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd 272(%rdi), %xmm3 ; SSE-NEXT: movapd 80(%rdi), %xmm2 ; SSE-NEXT: movapd 96(%rdi), %xmm5 -; SSE-NEXT: movapd 112(%rdi), %xmm11 +; SSE-NEXT: movapd 112(%rdi), %xmm12 ; SSE-NEXT: movapd 144(%rdi), %xmm6 ; SSE-NEXT: movapd 160(%rdi), %xmm14 ; SSE-NEXT: movapd 192(%rdi), %xmm7 -; SSE-NEXT: movapd 208(%rdi), %xmm12 +; SSE-NEXT: movapd 208(%rdi), %xmm11 ; SSE-NEXT: movapd 240(%rdi), %xmm10 ; SSE-NEXT: movapd 256(%rdi), %xmm13 ; SSE-NEXT: movapd 48(%rdi), %xmm9 @@ -385,66 +385,66 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm11, %xmm14 +; SSE-NEXT: movapd %xmm12, %xmm14 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm5[0],xmm14[1] ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm0[0] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm13, %xmm11 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm10[0],xmm11[1] +; SSE-NEXT: movapd %xmm13, %xmm12 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm10[0],xmm12[1] ; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm3[0] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm13[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm12, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm7[0],xmm13[1] +; SSE-NEXT: movapd %xmm11, %xmm8 +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm7[0],xmm8[1] ; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm4[0] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm12[0],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm11[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 336(%rdi), %xmm12 -; SSE-NEXT: movapd 352(%rdi), %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm12[0],xmm7[1] -; SSE-NEXT: movapd 368(%rdi), %xmm4 -; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm4[0] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; SSE-NEXT: movapd 288(%rdi), %xmm2 -; SSE-NEXT: movapd 304(%rdi), %xmm5 +; SSE-NEXT: movapd 336(%rdi), %xmm13 +; SSE-NEXT: movapd 352(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm13[0],xmm7[1] +; SSE-NEXT: movapd 368(%rdi), %xmm11 +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm11[0] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: movapd 288(%rdi), %xmm0 +; SSE-NEXT: movapd 304(%rdi), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd 320(%rdi), %xmm6 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm6[0] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] +; SSE-NEXT: movapd (%rdi), %xmm2 +; SSE-NEXT: movapd 16(%rdi), %xmm5 ; SSE-NEXT: movapd %xmm5, %xmm3 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd 320(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE-NEXT: movapd (%rdi), %xmm5 -; SSE-NEXT: movapd 16(%rdi), %xmm8 -; SSE-NEXT: movapd %xmm8, %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] -; SSE-NEXT: movapd 32(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm1[0] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE-NEXT: movapd %xmm3, 96(%rsi) +; SSE-NEXT: movapd 32(%rdi), %xmm4 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm4[0] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE-NEXT: movapd %xmm1, 96(%rsi) ; SSE-NEXT: movapd %xmm14, 32(%rsi) ; SSE-NEXT: movapd %xmm7, 112(%rsi) ; SSE-NEXT: movapd %xmm15, 48(%rsi) -; SSE-NEXT: movapd %xmm13, 64(%rsi) -; SSE-NEXT: movapd %xmm6, (%rsi) -; SSE-NEXT: movapd %xmm11, 80(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movapd %xmm2, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movapd %xmm12, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rdx) -; SSE-NEXT: movapd %xmm5, (%rdx) +; SSE-NEXT: movapd %xmm8, 64(%rsi) +; SSE-NEXT: movapd %xmm3, (%rsi) +; SSE-NEXT: movapd %xmm12, 80(%rsi) +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movapd %xmm0, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movapd %xmm13, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movapd %xmm2, (%rdx) ; SSE-NEXT: movapd %xmm10, 80(%rdx) ; SSE-NEXT: movapd %xmm9, 16(%rdx) -; SSE-NEXT: movapd %xmm0, 96(%rcx) -; SSE-NEXT: movapd %xmm4, 112(%rcx) +; SSE-NEXT: movapd %xmm6, 96(%rcx) +; SSE-NEXT: movapd %xmm11, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -453,7 +453,7 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movapd %xmm1, (%rcx) +; SSE-NEXT: movapd %xmm4, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: addq $24, %rsp @@ -464,55 +464,55 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = mem[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm4[1],ymm3[0],ymm4[3],ymm3[2] -; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm5[1],ymm2[0],ymm5[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = mem[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm7[1],ymm5[0],ymm7[3],ymm5[2] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm8[1],ymm2[0],ymm8[3],ymm2[2] ; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm7[1],ymm1[0],ymm7[3],ymm1[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm9[1],ymm1[0],ymm9[3],ymm1[2] ; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm6[1],ymm0[0],ymm6[3],ymm0[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm3[1],ymm0[0],ymm3[3],ymm0[2] ; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3] ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3] ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2],ymm2[3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd %ymm7, (%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 96(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm9, (%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 96(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm11, 64(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm10, (%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 96(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 32(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 96(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 32(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm1, (%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm2, 96(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 32(%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -639,56 +639,56 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i64_stride3_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $408, %rsp # imm = 0x198 -; SSE-NEXT: movapd 224(%rdi), %xmm0 -; SSE-NEXT: movapd 272(%rdi), %xmm3 -; SSE-NEXT: movapd 128(%rdi), %xmm1 -; SSE-NEXT: movapd 176(%rdi), %xmm5 -; SSE-NEXT: movapd 80(%rdi), %xmm4 -; SSE-NEXT: movapd 96(%rdi), %xmm6 -; SSE-NEXT: movapd 112(%rdi), %xmm11 -; SSE-NEXT: movapd 144(%rdi), %xmm7 -; SSE-NEXT: movapd 160(%rdi), %xmm12 -; SSE-NEXT: movapd 192(%rdi), %xmm8 -; SSE-NEXT: movapd 208(%rdi), %xmm13 -; SSE-NEXT: movapd 240(%rdi), %xmm9 -; SSE-NEXT: movapd 256(%rdi), %xmm2 -; SSE-NEXT: movapd 48(%rdi), %xmm10 -; SSE-NEXT: movapd 64(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm15, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm10[0],xmm14[1] +; SSE-NEXT: movapd 224(%rdi), %xmm6 +; SSE-NEXT: movapd 272(%rdi), %xmm9 +; SSE-NEXT: movapd 128(%rdi), %xmm5 +; SSE-NEXT: movapd 176(%rdi), %xmm8 +; SSE-NEXT: movapd 80(%rdi), %xmm7 +; SSE-NEXT: movapd 96(%rdi), %xmm10 +; SSE-NEXT: movapd 112(%rdi), %xmm0 +; SSE-NEXT: movapd 144(%rdi), %xmm11 +; SSE-NEXT: movapd 160(%rdi), %xmm1 +; SSE-NEXT: movapd 192(%rdi), %xmm12 +; SSE-NEXT: movapd 208(%rdi), %xmm2 +; SSE-NEXT: movapd 240(%rdi), %xmm13 +; SSE-NEXT: movapd 256(%rdi), %xmm3 +; SSE-NEXT: movapd 48(%rdi), %xmm14 +; SSE-NEXT: movapd 64(%rdi), %xmm4 +; SSE-NEXT: movapd %xmm4, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm14[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm7[0] ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm4[0] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm15[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm11, %xmm4 -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm12, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm5[0] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm12[0],xmm5[1] +; SSE-NEXT: movapd %xmm0, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm10[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm5[0] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm13, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm0[0] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm8[0] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm2, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm3[0] +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm6[0] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm3, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm9[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm3[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 288(%rdi), %xmm2 ; SSE-NEXT: movapd 304(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 @@ -740,80 +740,80 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 528(%rdi), %xmm15 -; SSE-NEXT: movapd 544(%rdi), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 560(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm0[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 576(%rdi), %xmm11 +; SSE-NEXT: movapd 544(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 560(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 576(%rdi), %xmm12 ; SSE-NEXT: movapd 592(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm11[0],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm12[0],xmm14[1] ; SSE-NEXT: movapd 608(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm1[0] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 624(%rdi), %xmm8 ; SSE-NEXT: movapd 640(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm8[0],xmm13[1] -; SSE-NEXT: movapd 656(%rdi), %xmm12 -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm12[0] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: movapd 672(%rdi), %xmm0 -; SSE-NEXT: movapd 688(%rdi), %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm0, %xmm11 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] +; SSE-NEXT: movapd 656(%rdi), %xmm13 +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm13[0] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd 672(%rdi), %xmm6 +; SSE-NEXT: movapd 688(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm5 +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] ; SSE-NEXT: movapd 704(%rdi), %xmm10 -; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm10[0] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1] -; SSE-NEXT: movapd 720(%rdi), %xmm2 -; SSE-NEXT: movapd 736(%rdi), %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm10[0] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movapd 720(%rdi), %xmm4 +; SSE-NEXT: movapd 736(%rdi), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm3 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] ; SSE-NEXT: movapd 752(%rdi), %xmm7 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm7[0] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] -; SSE-NEXT: movapd (%rdi), %xmm4 -; SSE-NEXT: movapd 16(%rdi), %xmm6 -; SSE-NEXT: movapd %xmm6, %xmm5 -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] +; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm7[0] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1] +; SSE-NEXT: movapd (%rdi), %xmm2 +; SSE-NEXT: movapd 16(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd 32(%rdi), %xmm9 -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm9[0] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm6[0],xmm9[1] -; SSE-NEXT: movapd %xmm1, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm9[0] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: movapd %xmm5, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movapd %xmm3, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 176(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movapd %xmm14, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) -; SSE-NEXT: movapd %xmm5, (%rsi) -; SSE-NEXT: movapd %xmm13, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movapd %xmm0, 224(%rdx) -; SSE-NEXT: movapd %xmm2, 240(%rdx) -; SSE-NEXT: movapd %xmm11, 192(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movapd %xmm1, (%rsi) +; SSE-NEXT: movapd %xmm11, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movapd %xmm6, 224(%rdx) +; SSE-NEXT: movapd %xmm4, 240(%rdx) +; SSE-NEXT: movapd %xmm12, 192(%rdx) ; SSE-NEXT: movapd %xmm8, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rdx) @@ -834,12 +834,12 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movapd %xmm4, (%rdx) +; SSE-NEXT: movapd %xmm2, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movapd %xmm7, 240(%rcx) ; SSE-NEXT: movapd %xmm10, 224(%rcx) -; SSE-NEXT: movapd %xmm12, 208(%rcx) +; SSE-NEXT: movapd %xmm13, 208(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -870,241 +870,233 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride3_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm1 +; AVX1-ONLY-NEXT: subq $232, %rsp +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = mem[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm9[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = mem[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm5 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = mem[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = mem[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[1],ymm10[0],ymm12[3],ymm10[2] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm8[0],ymm11[3],ymm8[2] +; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm9[0],ymm3[3],ymm9[2] +; AVX1-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm13[0],ymm2[3],ymm13[2] +; AVX1-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm7[0],ymm15[3],ymm7[2] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = mem[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[1],ymm3[0],ymm4[3],ymm3[2] +; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[1],ymm1[0],ymm6[3],ymm1[2] +; AVX1-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm5[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm7[1],ymm6[0],ymm7[3],ymm6[2] -; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm3[1],ymm8[0],ymm3[3],ymm8[2] -; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm10[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm4[1],ymm5[0],ymm4[3],ymm5[2] -; AVX1-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm12[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm1[1],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[1],ymm2[0],ymm5[3],ymm2[2] +; AVX1-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm2, %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm11[1],ymm2[0],ymm11[3],ymm2[2] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0],ymm0[1],ymm12[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm0[0],ymm10[1],ymm0[2],ymm10[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm3[1],ymm0[0],ymm3[3],ymm0[2] -; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = mem[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[1],ymm9[0],ymm4[3],ymm9[2] -; AVX1-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm1[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[1],ymm1[0],ymm5[3],ymm1[2] -; AVX1-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm2[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2],ymm2[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm6[1],ymm11[2],ymm6[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0],ymm13[1],ymm6[2],ymm13[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0],ymm13[1],mem[2],ymm13[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0],ymm8[1],ymm13[2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2],ymm13[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm13[0],ymm0[1],ymm13[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm13[2,3],mem[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm14[1],ymm4[2],ymm14[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm15, %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2],ymm15[3] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm0[1],ymm11[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm0[0],ymm7[1],ymm0[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm0[1],ymm4[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0],ymm3[1],mem[2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0],ymm1[1],ymm15[2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0],ymm8[1],mem[2],ymm8[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0],ymm13[1],ymm8[2],ymm13[3] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2],ymm13[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0],ymm2[1],ymm13[2],ymm2[3] ; AVX1-ONLY-NEXT: vmovapd %ymm5, 192(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 128(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm11, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 192(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 192(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 224(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 128(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 160(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 128(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm15, (%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 224(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 160(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 96(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 192(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 224(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 160(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 +; AVX1-ONLY-NEXT: vmovapd %ymm14, 32(%rcx) +; AVX1-ONLY-NEXT: addq $232, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride3_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $232, %rsp -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm5[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm7[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm10[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm12[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm2[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm15[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm0[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm0[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm15[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm6[0,1],mem[2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] @@ -1112,63 +1104,63 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = mem[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 496(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = mem[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = mem[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = mem[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm15, 128(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm13, 64(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 224(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 32(%rcx) ; AVX2-ONLY-NEXT: addq $232, %rsp ; AVX2-ONLY-NEXT: vzeroupper @@ -1252,56 +1244,56 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i64_stride3_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $1176, %rsp # imm = 0x498 -; SSE-NEXT: movapd 272(%rdi), %xmm1 -; SSE-NEXT: movapd 224(%rdi), %xmm2 -; SSE-NEXT: movapd 176(%rdi), %xmm3 -; SSE-NEXT: movapd 128(%rdi), %xmm4 +; SSE-NEXT: movapd 272(%rdi), %xmm9 +; SSE-NEXT: movapd 224(%rdi), %xmm8 +; SSE-NEXT: movapd 176(%rdi), %xmm7 +; SSE-NEXT: movapd 128(%rdi), %xmm6 ; SSE-NEXT: movapd 80(%rdi), %xmm5 -; SSE-NEXT: movapd 240(%rdi), %xmm6 -; SSE-NEXT: movapd 256(%rdi), %xmm11 -; SSE-NEXT: movapd 192(%rdi), %xmm7 -; SSE-NEXT: movapd 208(%rdi), %xmm12 -; SSE-NEXT: movapd 144(%rdi), %xmm8 -; SSE-NEXT: movapd 160(%rdi), %xmm13 -; SSE-NEXT: movapd 96(%rdi), %xmm9 -; SSE-NEXT: movapd 112(%rdi), %xmm14 -; SSE-NEXT: movapd 48(%rdi), %xmm10 -; SSE-NEXT: movapd 64(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm10[0],xmm15[1] +; SSE-NEXT: movapd 240(%rdi), %xmm10 +; SSE-NEXT: movapd 256(%rdi), %xmm0 +; SSE-NEXT: movapd 192(%rdi), %xmm11 +; SSE-NEXT: movapd 208(%rdi), %xmm1 +; SSE-NEXT: movapd 144(%rdi), %xmm12 +; SSE-NEXT: movapd 160(%rdi), %xmm2 +; SSE-NEXT: movapd 96(%rdi), %xmm13 +; SSE-NEXT: movapd 112(%rdi), %xmm3 +; SSE-NEXT: movapd 48(%rdi), %xmm14 +; SSE-NEXT: movapd 64(%rdi), %xmm4 +; SSE-NEXT: movapd %xmm4, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm14[0],xmm15[1] ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm5[0] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm5[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm14, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm4[0] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm14[0],xmm4[1] +; SSE-NEXT: movapd %xmm3, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm13[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm13, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm3[0] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm13[0],xmm3[1] +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm6[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, %xmm3 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm12[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm12, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm2[0] +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm7[0] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm12[0],xmm2[1] +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm11[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm11, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm8[0] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm9[0] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 288(%rdi), %xmm2 ; SSE-NEXT: movapd 304(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 @@ -1504,122 +1496,121 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1248(%rdi), %xmm2 ; SSE-NEXT: movapd 1264(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] +; SSE-NEXT: movapd %xmm0, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] ; SSE-NEXT: movapd 1280(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1296(%rdi), %xmm14 +; SSE-NEXT: movapd 1296(%rdi), %xmm15 ; SSE-NEXT: movapd 1312(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm11 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm14[0],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm15[0],xmm11[1] ; SSE-NEXT: movapd 1328(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm1[0] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1344(%rdi), %xmm12 ; SSE-NEXT: movapd 1360(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm12[0],xmm8[1] +; SSE-NEXT: movapd %xmm0, %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm12[0],xmm7[1] ; SSE-NEXT: movapd 1376(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm1[0] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movapd 1392(%rdi), %xmm9 -; SSE-NEXT: movapd 1408(%rdi), %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] -; SSE-NEXT: movapd 1424(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm0[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1440(%rdi), %xmm4 -; SSE-NEXT: movapd 1456(%rdi), %xmm7 -; SSE-NEXT: movapd %xmm7, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] -; SSE-NEXT: movapd 1472(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm0[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1488(%rdi), %xmm1 -; SSE-NEXT: movapd 1504(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] +; SSE-NEXT: movapd 1392(%rdi), %xmm10 +; SSE-NEXT: movapd 1408(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm10[0],xmm2[1] +; SSE-NEXT: movapd 1424(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 1440(%rdi), %xmm9 +; SSE-NEXT: movapd 1456(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: movapd 1472(%rdi), %xmm3 +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 1488(%rdi), %xmm0 +; SSE-NEXT: movapd 1504(%rdi), %xmm8 +; SSE-NEXT: movapd %xmm8, %xmm3 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd 1520(%rdi), %xmm13 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm13[0] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movapd (%rdi), %xmm10 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm13[0] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm8[0],xmm13[1] +; SSE-NEXT: movapd (%rdi), %xmm8 ; SSE-NEXT: movapd 16(%rdi), %xmm5 -; SSE-NEXT: movapd %xmm5, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm10[0],xmm7[1] -; SSE-NEXT: movapd 32(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm0[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, %xmm5 -; SSE-NEXT: movapd %xmm6, 496(%rsi) -; SSE-NEXT: movapd %xmm2, 480(%rsi) -; SSE-NEXT: movapd %xmm3, 464(%rsi) -; SSE-NEXT: movapd %xmm8, 448(%rsi) +; SSE-NEXT: movapd %xmm5, %xmm6 +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm8[0],xmm6[1] +; SSE-NEXT: movapd 32(%rdi), %xmm4 +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm4[0] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE-NEXT: movapd %xmm3, 496(%rsi) +; SSE-NEXT: movapd %xmm1, 480(%rsi) +; SSE-NEXT: movapd %xmm2, 464(%rsi) +; SSE-NEXT: movapd %xmm7, 448(%rsi) ; SSE-NEXT: movapd %xmm11, 432(%rsi) -; SSE-NEXT: movapd %xmm15, 416(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 400(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 384(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 368(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 352(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 336(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 320(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 304(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 288(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 272(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 256(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movapd %xmm7, (%rsi) -; SSE-NEXT: movapd %xmm1, 496(%rdx) -; SSE-NEXT: movapd %xmm4, 480(%rdx) -; SSE-NEXT: movapd %xmm9, 464(%rdx) +; SSE-NEXT: movapd %xmm14, 416(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 400(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 384(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 368(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 352(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 336(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 320(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 304(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 288(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 272(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 256(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 240(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movapd %xmm6, (%rsi) +; SSE-NEXT: movapd %xmm0, 496(%rdx) +; SSE-NEXT: movapd %xmm9, 480(%rdx) +; SSE-NEXT: movapd %xmm10, 464(%rdx) ; SSE-NEXT: movapd %xmm12, 448(%rdx) -; SSE-NEXT: movapd %xmm14, 432(%rdx) +; SSE-NEXT: movapd %xmm15, 432(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1672,7 +1663,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm10, (%rdx) +; SSE-NEXT: movapd %xmm8, (%rdx) ; SSE-NEXT: movapd %xmm13, 496(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 480(%rcx) @@ -1734,59 +1725,62 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movapd %xmm5, (%rcx) +; SSE-NEXT: movapd %xmm4, (%rcx) ; SSE-NEXT: addq $1176, %rsp # imm = 0x498 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1128, %rsp # imm = 0x468 +; AVX1-ONLY-NEXT: subq $1096, %rsp # imm = 0x448 ; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd %ymm4, %ymm6 ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd %ymm2, %ymm4 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm9 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = mem[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[1],ymm4[0],ymm5[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[1],ymm6[0],ymm7[3],ymm6[2] ; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm3[0],ymm6[3],ymm3[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[1],ymm5[0],ymm8[3],ymm5[2] ; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[1],ymm2[0],ymm7[3],ymm2[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm9[1],ymm4[0],ymm9[3],ymm4[2] ; AVX1-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[1],ymm9[0],ymm8[3],ymm9[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm3[0],ymm10[3],ymm3[2] ; AVX1-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm11[0],ymm10[3],ymm11[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm2[0],ymm11[3],ymm2[2] ; AVX1-ONLY-NEXT: vbroadcastsd 944(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm11[0],ymm0[3],ymm11[2] +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm9[0],ymm0[3],ymm9[2] ; AVX1-ONLY-NEXT: vbroadcastsd 1136(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1797,10 +1791,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vbroadcastsd 1328(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm5[0],ymm0[3],ymm5[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm7[0],ymm0[3],ymm7[2] ; AVX1-ONLY-NEXT: vbroadcastsd 1520(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1812,177 +1806,179 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm13[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm10[0],ymm0[3],ymm10[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm13[0],ymm0[3],ymm13[2] ; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm12[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm9[0],ymm0[3],ymm9[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm12[0],ymm0[3],ymm12[2] ; AVX1-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm10[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm7[0],ymm0[3],ymm7[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm10[0],ymm0[3],ymm10[2] ; AVX1-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = mem[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[1],ymm4[0],ymm12[3],ymm4[2] -; AVX1-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = mem[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[1],ymm5[0],ymm14[3],ymm5[2] +; AVX1-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm3[0],ymm6[3],ymm3[2] -; AVX1-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm4[0],ymm11[3],ymm4[2] +; AVX1-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm2[0],ymm15[3],ymm2[2] -; AVX1-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm2[0],ymm6[3],ymm2[2] +; AVX1-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1376(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm14[1],ymm0[0],ymm14[3],ymm0[2] -; AVX1-ONLY-NEXT: vbroadcastsd 1424(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0],ymm1[1],mem[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1264(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = mem[0],ymm1[1],mem[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0],ymm1[1],mem[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm14[0],ymm1[1],ymm14[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1376(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm1[0],ymm3[3],ymm1[2] +; AVX1-ONLY-NEXT: vbroadcastsd 1424(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd $5, (%rsp), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0],ymm0[1],mem[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2],ymm9[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 1264(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0],ymm0[1],mem[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0],ymm0[1],mem[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0],ymm0[1],ymm12[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm3[0],ymm0[1],ymm3[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0],ymm1[1],mem[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm1[0],ymm7[1],ymm1[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0],ymm7[1],mem[2],ymm7[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0],ymm9[1],mem[2],ymm9[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2],ymm10[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm10[2,3],mem[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3],ymm10[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovapd %ymm5, 448(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm15, 384(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 320(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 256(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 192(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 128(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm1[1],ymm6[2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0],ymm1[1],ymm11[2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm1[0],ymm4[1],ymm1[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0],ymm4[1],mem[2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm4[0],ymm9[1],ymm4[2],ymm9[3] +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0],ymm4[1],mem[2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm12[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = mem[0],ymm11[1],mem[2],ymm11[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm11[0],ymm13[1],ymm11[2],ymm13[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm11[2,3],mem[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovapd %ymm7, 448(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 384(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 320(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 256(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 192(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 128(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 64(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 480(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 416(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 352(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 416(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2025,21 +2021,19 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 64(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 128(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 64(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 128(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 192(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 256(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 320(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rcx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rcx) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rcx) @@ -2053,7 +2047,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: addq $1128, %rsp # imm = 0x468 +; AVX1-ONLY-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2107,182 +2101,182 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,3,2,3] +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,3,2,3] +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,3,2,3] +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,3,2,3] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 944(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm10[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 944(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 1136(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, (%rsp), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 1328(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 1520(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1136(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm9[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1328(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1520(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[0,1],ymm7[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm6[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm8, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm5[2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 1424(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm1[6,7] @@ -2440,138 +2434,124 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i64_stride3_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $200, %rsp -; AVX512-NEXT: vmovaps 1472(%rdi), %zmm0 -; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm16 -; AVX512-NEXT: vmovaps 1280(%rdi), %zmm0 -; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm15 -; AVX512-NEXT: vmovaps 1088(%rdi), %zmm1 -; AVX512-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm22 -; AVX512-NEXT: vmovaps 896(%rdi), %zmm1 -; AVX512-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm22 +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm27 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm26 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm29 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm23 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm27 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,3,6,9,12,15,u,u> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,3,6,9,12,15,u,u> ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm6 -; AVX512-NEXT: vpermt2q %zmm20, %zmm13, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm3 -; AVX512-NEXT: vpermt2q %zmm23, %zmm13, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512-NEXT: vpermt2q %zmm12, %zmm13, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm9 -; AVX512-NEXT: vpermt2q %zmm17, %zmm13, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512-NEXT: vpermt2q %zmm18, %zmm13, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512-NEXT: vpermt2q %zmm0, %zmm13, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512-NEXT: vpermt2q %zmm14, %zmm13, %zmm25 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm26 = <1,4,7,10,13,u,u,u> -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512-NEXT: vpermt2q %zmm20, %zmm26, %zmm29 +; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm15 +; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm16 +; AVX512-NEXT: vpermt2q %zmm9, %zmm11, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17 +; AVX512-NEXT: vpermt2q %zmm10, %zmm11, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm18 +; AVX512-NEXT: vpermt2q %zmm12, %zmm11, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = <1,4,7,10,13,u,u,u> +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = <10,13,0,3,6,u,u,u> -; AVX512-NEXT: vpermt2q %zmm30, %zmm31, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm30 -; AVX512-NEXT: vpermt2q %zmm12, %zmm26, %zmm30 -; AVX512-NEXT: vpermt2q %zmm28, %zmm31, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm30 +; AVX512-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 +; AVX512-NEXT: vpermt2q %zmm29, %zmm31, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512-NEXT: vpermt2q %zmm8, %zmm21, %zmm29 +; AVX512-NEXT: vpermt2q %zmm28, %zmm31, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm28 -; AVX512-NEXT: vpermt2q %zmm23, %zmm26, %zmm28 -; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512-NEXT: vpermt2q %zmm18, %zmm26, %zmm27 -; AVX512-NEXT: vpermt2q %zmm22, %zmm31, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512-NEXT: vpermt2q %zmm17, %zmm26, %zmm22 -; AVX512-NEXT: vpermt2q %zmm21, %zmm31, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512-NEXT: vpermt2q %zmm14, %zmm26, %zmm21 -; AVX512-NEXT: vpermt2q %zmm16, %zmm31, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512-NEXT: vpermt2q %zmm0, %zmm26, %zmm16 -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm12 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512-NEXT: vpermi2q %zmm15, %zmm19, %zmm13 -; AVX512-NEXT: vpermi2q %zmm15, %zmm19, %zmm26 -; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm15 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm19 +; AVX512-NEXT: vpermt2q %zmm12, %zmm21, %zmm28 +; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm27 +; AVX512-NEXT: vpermt2q %zmm26, %zmm31, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm26 +; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm26 +; AVX512-NEXT: vpermt2q %zmm22, %zmm31, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512-NEXT: vpermt2q %zmm13, %zmm21, %zmm22 +; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm13 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm19 +; AVX512-NEXT: vpermi2q %zmm19, %zmm25, %zmm11 +; AVX512-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 +; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm10 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm9 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm8 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm24 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm25 +; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 +; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm16 +; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm17 +; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm18 +; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm20 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] -; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm16 -; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm29 +; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 +; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 ; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 -; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm28 -; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm27 -; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm22 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm21 -; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm26 +; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm29 +; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm28 +; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 +; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] -; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm20 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm19 -; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm23 -; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm18 -; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm17 +; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 +; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 +; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 +; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm12 +; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm10 ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm14 -; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm12 -; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm25, 448(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm24, 384(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm8, 320(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm9, 256(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm10, 192(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm11, 128(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm13, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm21, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm22, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm27, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm28, 128(%rdx) +; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm13 +; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm23, 448(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm20, 384(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm18, 320(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm17, 256(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm11, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm26, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm27, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm28, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm29, 128(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm30, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm26, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm29, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm16, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm12, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm21, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm24, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm22, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm13, 384(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm14, 448(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm17, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm18, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm23, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm19, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm20, 64(%rcx) -; AVX512-NEXT: addq $200, %rsp +; AVX512-NEXT: vmovdqa64 %zmm10, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm12, 320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm9, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <192 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll index 5ab0db77a9a06..1b508c764cc8c 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll @@ -256,56 +256,56 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: movaps 240(%rdi), %xmm5 ; SSE-NEXT: movaps 208(%rdi), %xmm3 -; SSE-NEXT: movaps 176(%rdi), %xmm6 +; SSE-NEXT: movaps 176(%rdi), %xmm9 ; SSE-NEXT: movaps 144(%rdi), %xmm1 -; SSE-NEXT: movaps 112(%rdi), %xmm10 +; SSE-NEXT: movaps 112(%rdi), %xmm11 ; SSE-NEXT: movaps 80(%rdi), %xmm2 ; SSE-NEXT: movaps (%rdi), %xmm4 ; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: movaps 32(%rdi), %xmm12 -; SSE-NEXT: movaps 224(%rdi), %xmm13 -; SSE-NEXT: movaps 192(%rdi), %xmm8 -; SSE-NEXT: movaps 160(%rdi), %xmm14 -; SSE-NEXT: movaps 128(%rdi), %xmm7 -; SSE-NEXT: movaps 96(%rdi), %xmm15 -; SSE-NEXT: movaps 64(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] +; SSE-NEXT: movaps 32(%rdi), %xmm13 +; SSE-NEXT: movaps 224(%rdi), %xmm14 +; SSE-NEXT: movaps 192(%rdi), %xmm7 +; SSE-NEXT: movaps 160(%rdi), %xmm15 +; SSE-NEXT: movaps 128(%rdi), %xmm6 +; SSE-NEXT: movaps 96(%rdi), %xmm12 +; SSE-NEXT: movaps 64(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm12[1] +; SSE-NEXT: movaps %xmm6, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm15[1] ; SSE-NEXT: movaps %xmm7, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] -; SSE-NEXT: movaps %xmm8, %xmm14 +; SSE-NEXT: movaps %xmm4, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1] -; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm12[1] -; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm10[1] -; SSE-NEXT: movaps %xmm1, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1] +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] +; SSE-NEXT: movaps %xmm3, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm5[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: movaps %xmm14, 48(%rsi) -; SSE-NEXT: movaps %xmm13, (%rsi) -; SSE-NEXT: movaps %xmm15, 32(%rsi) -; SSE-NEXT: movaps %xmm11, 16(%rsi) -; SSE-NEXT: movaps %xmm8, 48(%rdx) +; SSE-NEXT: movaps %xmm15, 48(%rsi) +; SSE-NEXT: movaps %xmm14, (%rsi) +; SSE-NEXT: movaps %xmm12, 32(%rsi) +; SSE-NEXT: movaps %xmm10, 16(%rsi) +; SSE-NEXT: movaps %xmm7, 48(%rdx) ; SSE-NEXT: movaps %xmm4, (%rdx) -; SSE-NEXT: movaps %xmm7, 32(%rdx) -; SSE-NEXT: movaps %xmm9, 16(%rdx) -; SSE-NEXT: movaps %xmm6, 48(%rcx) -; SSE-NEXT: movaps %xmm10, 32(%rcx) -; SSE-NEXT: movaps %xmm12, 16(%rcx) +; SSE-NEXT: movaps %xmm6, 32(%rdx) +; SSE-NEXT: movaps %xmm8, 16(%rdx) +; SSE-NEXT: movaps %xmm9, 48(%rcx) +; SSE-NEXT: movaps %xmm11, 32(%rcx) +; SSE-NEXT: movaps %xmm13, 16(%rcx) ; SSE-NEXT: movaps %xmm3, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%r8) @@ -469,103 +469,103 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i64_stride4_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps (%rdi), %xmm4 +; SSE-NEXT: movaps (%rdi), %xmm7 ; SSE-NEXT: movaps 416(%rdi), %xmm0 -; SSE-NEXT: movaps 384(%rdi), %xmm5 +; SSE-NEXT: movaps 384(%rdi), %xmm8 ; SSE-NEXT: movaps 160(%rdi), %xmm1 -; SSE-NEXT: movaps 128(%rdi), %xmm6 +; SSE-NEXT: movaps 128(%rdi), %xmm9 ; SSE-NEXT: movaps 480(%rdi), %xmm2 -; SSE-NEXT: movaps 448(%rdi), %xmm7 +; SSE-NEXT: movaps 448(%rdi), %xmm10 ; SSE-NEXT: movaps 224(%rdi), %xmm3 ; SSE-NEXT: movaps 192(%rdi), %xmm11 -; SSE-NEXT: movaps 288(%rdi), %xmm8 +; SSE-NEXT: movaps 288(%rdi), %xmm4 ; SSE-NEXT: movaps 256(%rdi), %xmm13 -; SSE-NEXT: movaps 352(%rdi), %xmm9 +; SSE-NEXT: movaps 352(%rdi), %xmm5 ; SSE-NEXT: movaps 320(%rdi), %xmm12 -; SSE-NEXT: movaps 96(%rdi), %xmm10 +; SSE-NEXT: movaps 96(%rdi), %xmm6 ; SSE-NEXT: movaps 64(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm10[0] +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm9[1] +; SSE-NEXT: movaps %xmm12, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm8[1] +; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm3[0] -; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm10, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm4, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdi), %xmm0 -; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 80(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 176(%rdi), %xmm0 -; SSE-NEXT: movaps 144(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: movaps 144(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 208(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 208(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 304(%rdi), %xmm0 -; SSE-NEXT: movaps 272(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 272(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 336(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm9 +; SSE-NEXT: movaps 336(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm9 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: movaps 432(%rdi), %xmm0 -; SSE-NEXT: movaps 400(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: movaps 400(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm6 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movaps 496(%rdi), %xmm0 -; SSE-NEXT: movaps 464(%rdi), %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 496(%rdi), %xmm1 +; SSE-NEXT: movaps 464(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps 48(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps 16(%rdi), %xmm5 -; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -576,7 +576,8 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps %xmm15, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -599,22 +600,21 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps %xmm6, 96(%rcx) ; SSE-NEXT: movaps %xmm14, 32(%rcx) -; SSE-NEXT: movaps %xmm3, 112(%rcx) -; SSE-NEXT: movaps %xmm12, 48(%rcx) -; SSE-NEXT: movaps %xmm11, 64(%rcx) -; SSE-NEXT: movaps %xmm4, (%rcx) +; SSE-NEXT: movaps %xmm4, 112(%rcx) +; SSE-NEXT: movaps %xmm13, 48(%rcx) +; SSE-NEXT: movaps %xmm10, 64(%rcx) +; SSE-NEXT: movaps %xmm3, (%rcx) ; SSE-NEXT: movaps %xmm9, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm1, 112(%r8) -; SSE-NEXT: movaps %xmm2, 96(%r8) -; SSE-NEXT: movaps %xmm7, 80(%r8) -; SSE-NEXT: movaps %xmm10, 64(%r8) -; SSE-NEXT: movaps %xmm8, 48(%r8) -; SSE-NEXT: movaps %xmm13, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm5, (%r8) +; SSE-NEXT: movaps %xmm2, 112(%r8) +; SSE-NEXT: movaps %xmm5, 96(%r8) +; SSE-NEXT: movaps %xmm8, 80(%r8) +; SSE-NEXT: movaps %xmm7, 64(%r8) +; SSE-NEXT: movaps %xmm12, 48(%r8) +; SSE-NEXT: movaps %xmm11, 32(%r8) +; SSE-NEXT: movaps %xmm15, 16(%r8) +; SSE-NEXT: movaps %xmm1, (%r8) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq ; @@ -638,16 +638,16 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm7[0] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm11[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm12[0],xmm10[0] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm12[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm13[0] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill @@ -655,14 +655,14 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm12[1],xmm10[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] @@ -689,11 +689,11 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 @@ -765,16 +765,18 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-ONLY-LABEL: load_i64_stride4_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $328, %rsp # imm = 0x148 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: subq $360, %rsp # imm = 0x168 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm2, %ymm9 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 @@ -790,93 +792,92 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm13, %ymm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm8, %ymm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm8, %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm15, %ymm15 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm14, %ymm1 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm15[0],ymm10[2],ymm15[2] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm10, %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm12[0],ymm5[2],ymm12[2] -; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm11 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm14[0],ymm5[2],ymm14[2] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm15[0],ymm5[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm14 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm11[0],ymm8[2],ymm11[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm15[1],ymm5[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm5[1],ymm14[1],ymm5[3],ymm14[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm7[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm7[0],ymm3[2],ymm7[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm10[1],ymm14[1],ymm10[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm10[1],ymm13[1],ymm10[3],ymm13[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm15, (%rcx) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%r8) -; AVX2-ONLY-NEXT: addq $328, %rsp # imm = 0x148 +; AVX2-ONLY-NEXT: addq $360, %rsp # imm = 0x168 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -959,15 +960,15 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: subq $664, %rsp # imm = 0x298 ; SSE-NEXT: movaps 416(%rdi), %xmm0 ; SSE-NEXT: movaps 384(%rdi), %xmm9 -; SSE-NEXT: movaps 160(%rdi), %xmm1 +; SSE-NEXT: movaps 160(%rdi), %xmm3 ; SSE-NEXT: movaps 128(%rdi), %xmm8 -; SSE-NEXT: movaps 480(%rdi), %xmm2 +; SSE-NEXT: movaps 480(%rdi), %xmm1 ; SSE-NEXT: movaps 448(%rdi), %xmm11 -; SSE-NEXT: movaps 224(%rdi), %xmm4 +; SSE-NEXT: movaps 224(%rdi), %xmm5 ; SSE-NEXT: movaps 192(%rdi), %xmm10 -; SSE-NEXT: movaps 288(%rdi), %xmm5 +; SSE-NEXT: movaps 288(%rdi), %xmm4 ; SSE-NEXT: movaps 256(%rdi), %xmm12 -; SSE-NEXT: movaps 608(%rdi), %xmm3 +; SSE-NEXT: movaps 608(%rdi), %xmm2 ; SSE-NEXT: movaps 352(%rdi), %xmm6 ; SSE-NEXT: movaps 320(%rdi), %xmm14 ; SSE-NEXT: movaps 96(%rdi), %xmm7 @@ -978,29 +979,29 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm10, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: movaps %xmm8, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] +; SSE-NEXT: movaps %xmm12, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] +; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -1009,9 +1010,9 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 576(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 544(%rdi), %xmm0 ; SSE-NEXT: movaps 512(%rdi), %xmm1 @@ -1110,14 +1111,14 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 496(%rdi), %xmm0 ; SSE-NEXT: movaps 464(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 560(%rdi), %xmm0 ; SSE-NEXT: movaps 528(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -1126,46 +1127,46 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 624(%rdi), %xmm0 -; SSE-NEXT: movaps 592(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps 592(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 688(%rdi), %xmm0 ; SSE-NEXT: movaps 656(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: movaps %xmm13, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 752(%rdi), %xmm0 -; SSE-NEXT: movaps 720(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 720(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 816(%rdi), %xmm0 ; SSE-NEXT: movaps 784(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 880(%rdi), %xmm0 ; SSE-NEXT: movaps 848(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps 944(%rdi), %xmm0 -; SSE-NEXT: movaps 912(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: movaps 1008(%rdi), %xmm5 +; SSE-NEXT: movaps 912(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movaps 1008(%rdi), %xmm4 ; SSE-NEXT: movaps 976(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] -; SSE-NEXT: movaps 16(%rdi), %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: movaps 16(%rdi), %xmm4 ; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1231,11 +1232,11 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps %xmm1, 240(%rcx) -; SSE-NEXT: movaps %xmm3, 224(%rcx) -; SSE-NEXT: movaps %xmm6, 208(%rcx) -; SSE-NEXT: movaps %xmm8, 192(%rcx) -; SSE-NEXT: movaps %xmm11, 176(%rcx) -; SSE-NEXT: movaps %xmm14, 160(%rcx) +; SSE-NEXT: movaps %xmm5, 224(%rcx) +; SSE-NEXT: movaps %xmm8, 208(%rcx) +; SSE-NEXT: movaps %xmm10, 192(%rcx) +; SSE-NEXT: movaps %xmm12, 176(%rcx) +; SSE-NEXT: movaps %xmm15, 160(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1254,19 +1255,19 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm5, (%rcx) +; SSE-NEXT: movaps %xmm6, (%rcx) ; SSE-NEXT: movaps %xmm2, 240(%r8) -; SSE-NEXT: movaps %xmm4, 224(%r8) +; SSE-NEXT: movaps %xmm3, 224(%r8) ; SSE-NEXT: movaps %xmm7, 208(%r8) ; SSE-NEXT: movaps %xmm9, 192(%r8) -; SSE-NEXT: movaps %xmm12, 176(%r8) +; SSE-NEXT: movaps %xmm11, 176(%r8) ; SSE-NEXT: movaps %xmm13, 160(%r8) -; SSE-NEXT: movaps %xmm15, 144(%r8) +; SSE-NEXT: movaps %xmm14, 144(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%r8) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%r8) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) @@ -1278,7 +1279,7 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm10, (%r8) +; SSE-NEXT: movaps %xmm4, (%r8) ; SSE-NEXT: addq $664, %rsp # imm = 0x298 ; SSE-NEXT: retq ; @@ -1289,13 +1290,13 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm5[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm7 @@ -1313,17 +1314,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm5[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -1393,11 +1394,11 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 @@ -1405,38 +1406,38 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 912(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 912(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm12 @@ -1460,54 +1461,54 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] ; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1588,10 +1589,10 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm14, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm13, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm15, 224(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) @@ -1630,11 +1631,11 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm10, %ymm10 @@ -1654,7 +1655,7 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1670,186 +1671,188 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm14[0],ymm10[2],ymm14[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm8[0],ymm15[0],ymm8[2],ymm15[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm11[0],ymm7[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm14[1],ymm10[3],ymm14[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm15[1],ymm8[3],ymm15[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 160(%r8) +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm15[1],ymm6[1],ymm15[3],ymm6[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm11, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2320,7 +2323,7 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 1040(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1136(%rdi), %xmm0 @@ -2371,55 +2374,55 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 1584(%rdi), %xmm0 ; SSE-NEXT: movaps 1552(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1648(%rdi), %xmm0 -; SSE-NEXT: movaps 1616(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 1616(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 1712(%rdi), %xmm0 -; SSE-NEXT: movaps 1680(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1680(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 1776(%rdi), %xmm0 ; SSE-NEXT: movaps 1744(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: movaps %xmm12, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 1840(%rdi), %xmm0 -; SSE-NEXT: movaps 1808(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 1808(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 1904(%rdi), %xmm0 -; SSE-NEXT: movaps 1872(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] -; SSE-NEXT: movaps 1968(%rdi), %xmm0 -; SSE-NEXT: movaps 1936(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 1872(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps 2032(%rdi), %xmm13 -; SSE-NEXT: movaps 2000(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] -; SSE-NEXT: movaps 16(%rdi), %xmm13 +; SSE-NEXT: movaps 1968(%rdi), %xmm0 +; SSE-NEXT: movaps 1936(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps 2032(%rdi), %xmm8 +; SSE-NEXT: movaps 2000(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] +; SSE-NEXT: movaps 16(%rdi), %xmm10 ; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps %xmm10, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 496(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2548,14 +2551,16 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm4, 496(%rcx) -; SSE-NEXT: movaps %xmm1, 480(%rcx) -; SSE-NEXT: movaps %xmm2, 464(%rcx) -; SSE-NEXT: movaps %xmm3, 448(%rcx) -; SSE-NEXT: movaps %xmm9, 432(%rcx) -; SSE-NEXT: movaps %xmm8, 416(%rcx) -; SSE-NEXT: movaps %xmm11, 400(%rcx) -; SSE-NEXT: movaps %xmm14, 384(%rcx) +; SSE-NEXT: movaps %xmm1, 496(%rcx) +; SSE-NEXT: movaps %xmm3, 480(%rcx) +; SSE-NEXT: movaps %xmm5, 464(%rcx) +; SSE-NEXT: movaps %xmm7, 448(%rcx) +; SSE-NEXT: movaps %xmm11, 432(%rcx) +; SSE-NEXT: movaps %xmm14, 416(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 384(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2570,7 +2575,7 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 288(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rcx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rcx) @@ -2602,19 +2607,17 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm15, (%rcx) -; SSE-NEXT: movaps %xmm5, 496(%r8) -; SSE-NEXT: movaps %xmm6, 480(%r8) -; SSE-NEXT: movaps %xmm7, 464(%r8) -; SSE-NEXT: movaps %xmm10, 448(%r8) +; SSE-NEXT: movaps %xmm8, (%rcx) +; SSE-NEXT: movaps %xmm2, 496(%r8) +; SSE-NEXT: movaps %xmm4, 480(%r8) +; SSE-NEXT: movaps %xmm6, 464(%r8) +; SSE-NEXT: movaps %xmm9, 448(%r8) ; SSE-NEXT: movaps %xmm12, 432(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 416(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 400(%r8) +; SSE-NEXT: movaps %xmm13, 416(%r8) +; SSE-NEXT: movaps %xmm15, 400(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 352(%r8) @@ -2660,13 +2663,13 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm13, (%r8) +; SSE-NEXT: movaps %xmm10, (%r8) ; SSE-NEXT: addq $1688, %rsp # imm = 0x698 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride4_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2680, %rsp # imm = 0xA78 +; AVX1-ONLY-NEXT: subq $2728, %rsp # imm = 0xAA8 ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -2859,40 +2862,40 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 @@ -2907,417 +2910,421 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1712(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1680(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1712(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 1680(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1968(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1936(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1968(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 1936(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm7[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm4[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm8[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm8[0],xmm9[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm11[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm12[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm13[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm6[0],xmm14[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm15[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm7[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm10[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd (%rsp), %xmm12, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm12[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm11[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm12[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm13[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm15[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 464(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 448(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 256(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 384(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 320(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 272(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 400(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 336(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 208(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 144(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 496(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 480(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 416(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 352(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 288(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 432(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 368(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 304(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 240(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 176(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 144(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 256(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 272(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 208(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 320(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 336(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 384(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 400(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 448(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 464(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 176(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 240(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 288(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 304(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 352(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 368(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 416(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 432(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 480(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 496(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 448(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 384(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 256(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 480(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 352(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 288(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 480(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%r8) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 464(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 448(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 464(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 480(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 448(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 384(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 352(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 320(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 288(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 256(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 192(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 256(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm13, (%r8) -; AVX1-ONLY-NEXT: addq $2680, %rsp # imm = 0xA78 +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%r8) +; AVX1-ONLY-NEXT: addq $2728, %rsp # imm = 0xAA8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3453,402 +3460,404 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm15[0],ymm10[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm12[0],ymm6[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd (%rsp), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm10[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm8[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm9[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm8[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm5[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm3[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm3[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm12[1],ymm6[3],ymm12[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 448(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 384(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 320(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 256(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 480(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 352(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 288(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 448(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 384(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 320(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 256(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 480(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 352(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 288(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 448(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 384(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 320(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 256(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 480(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 352(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 288(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 416(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 384(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 352(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 320(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 288(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 448(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 416(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 352(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 320(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 288(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 256(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 224(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 160(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) @@ -3859,188 +3868,188 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-LABEL: load_i64_stride4_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm24 ; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm26 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm28 ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm27 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm27 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm30 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm29 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,4,8,12,0,4,8,12] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,4,8,12,0,4,8,12] +; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm28, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm29, %zmm9, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512-NEXT: vpermt2q %zmm27, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm30, %zmm9, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-NEXT: vpermt2q %zmm20, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm27, %zmm9, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vpermt2q %zmm17, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm25, %zmm9, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm22, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm28, %zmm9, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm26, %zmm9, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm24, %zmm9, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,5,9,13,1,5,9,13] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,5,9,13,1,5,9,13] +; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm28, %zmm16, %zmm7 +; AVX512-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512-NEXT: vpermt2q %zmm27, %zmm16, %zmm7 +; AVX512-NEXT: vpermt2q %zmm30, %zmm15, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm22, %zmm16, %zmm7 +; AVX512-NEXT: vpermt2q %zmm28, %zmm15, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpermt2q %zmm18, %zmm16, %zmm7 +; AVX512-NEXT: vpermt2q %zmm26, %zmm15, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-NEXT: vpermt2q %zmm20, %zmm16, %zmm7 +; AVX512-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vpermt2q %zmm17, %zmm16, %zmm7 +; AVX512-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [2,6,10,14,2,6,10,14] -; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] +; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm28, %zmm29, %zmm7 +; AVX512-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] ; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-NEXT: vpermt2q %zmm27, %zmm29, %zmm6 +; AVX512-NEXT: vpermt2q %zmm30, %zmm22, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm5 +; AVX512-NEXT: vpermt2q %zmm30, %zmm31, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-NEXT: vpermt2q %zmm22, %zmm29, %zmm5 +; AVX512-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm22, %zmm31, %zmm2 +; AVX512-NEXT: vpermt2q %zmm28, %zmm31, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm18, %zmm29, %zmm2 +; AVX512-NEXT: vpermt2q %zmm26, %zmm22, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm18, %zmm31, %zmm1 +; AVX512-NEXT: vpermt2q %zmm26, %zmm31, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 +; AVX512-NEXT: vpermt2q %zmm27, %zmm22, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm20, %zmm31, %zmm4 +; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 +; AVX512-NEXT: vpermt2q %zmm25, %zmm22, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm17, %zmm31, %zmm3 +; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm1 +; AVX512-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm1 +; AVX512-NEXT: vpermt2q %zmm24, %zmm22, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm20 -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm30 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm21 -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm26 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm28 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm28 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm0 +; AVX512-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm17 -; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm27 -; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm27 -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm4 -; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm19 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm24 -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm5 -; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm18 -; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm18 -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm12 -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm10 -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm2 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm24 +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm30 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm30 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm28 +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm26 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 +; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm27 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm13 +; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm21 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm11 +; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm17 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm8 +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm12 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512-NEXT: vpermt2q %zmm0, %zmm29, %zmm9 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm10 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm2 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm11 -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm29 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload @@ -4057,97 +4066,97 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload ; AVX512-NEXT: # zmm30 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm13[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm19[0,1,2,3],zmm23[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm3[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm14[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm14[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm20[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm4[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm16[4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 # 64-byte Folded Reload -; AVX512-NEXT: # zmm19 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload -; AVX512-NEXT: # zmm23 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm28 # 64-byte Folded Reload -; AVX512-NEXT: # zmm28 = zmm28[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm25[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm16[0,1,2,3],zmm6[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm18[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload ; AVX512-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload ; AVX512-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm29 # 64-byte Folded Reload +; AVX512-NEXT: # zmm29 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm25[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm15[0,1,2,3],zmm5[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload ; AVX512-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm27[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm29[0,1,2,3],zmm9[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm27 # 64-byte Folded Reload -; AVX512-NEXT: # zmm27 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload -; AVX512-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 64-byte Folded Reload -; AVX512-NEXT: # zmm20 = zmm20[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm17[0,1,2,3],zmm21[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm13, 448(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm7, 384(%rsi) +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm27[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm22[0,1,2,3],zmm10[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm22 # 64-byte Folded Reload +; AVX512-NEXT: # zmm22 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm27 # 64-byte Folded Reload +; AVX512-NEXT: # zmm27 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm24 # 64-byte Folded Reload +; AVX512-NEXT: # zmm24 = zmm24[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm28[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm13[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm8[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm14, 448(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm6, 384(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm30, 320(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm26, 256(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm31, 192(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512-NEXT: vmovups (%rsp), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm11, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm22, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm28, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm23, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm14, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm5, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm24, 448(%rcx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm19, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm29, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm20, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm18, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm21, 448(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm0, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm3, 320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm4, 320(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm9, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm10, 384(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm2, 384(%r8) -; AVX512-NEXT: vmovdqa64 %zmm4, 448(%r8) -; AVX512-NEXT: vmovdqa64 %zmm17, 256(%r8) -; AVX512-NEXT: vmovdqa64 %zmm20, 320(%r8) -; AVX512-NEXT: vmovdqa64 %zmm8, 128(%r8) -; AVX512-NEXT: vmovdqa64 %zmm29, 192(%r8) +; AVX512-NEXT: vmovdqa64 %zmm17, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm10, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm15, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm12, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm3, 384(%r8) +; AVX512-NEXT: vmovdqa64 %zmm11, 448(%r8) +; AVX512-NEXT: vmovdqa64 %zmm23, 256(%r8) +; AVX512-NEXT: vmovdqa64 %zmm24, 320(%r8) +; AVX512-NEXT: vmovdqa64 %zmm7, 128(%r8) +; AVX512-NEXT: vmovdqa64 %zmm27, 192(%r8) ; AVX512-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm27, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm22, 64(%r8) ; AVX512-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll index a1c9ad9e8ded6..b677dd7079434 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll @@ -361,69 +361,69 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movapd 224(%rdi), %xmm0 ; SSE-NEXT: movapd 256(%rdi), %xmm4 ; SSE-NEXT: movapd 176(%rdi), %xmm3 -; SSE-NEXT: movapd 288(%rdi), %xmm6 -; SSE-NEXT: movapd 208(%rdi), %xmm5 -; SSE-NEXT: movapd (%rdi), %xmm8 -; SSE-NEXT: movapd 16(%rdi), %xmm7 -; SSE-NEXT: movapd 32(%rdi), %xmm13 -; SSE-NEXT: movapd 48(%rdi), %xmm9 -; SSE-NEXT: movapd 240(%rdi), %xmm10 -; SSE-NEXT: movapd 272(%rdi), %xmm14 -; SSE-NEXT: movapd 160(%rdi), %xmm11 +; SSE-NEXT: movapd 288(%rdi), %xmm7 +; SSE-NEXT: movapd 208(%rdi), %xmm6 +; SSE-NEXT: movapd (%rdi), %xmm9 +; SSE-NEXT: movapd 16(%rdi), %xmm5 +; SSE-NEXT: movapd 32(%rdi), %xmm14 +; SSE-NEXT: movapd 48(%rdi), %xmm8 +; SSE-NEXT: movapd 240(%rdi), %xmm11 +; SSE-NEXT: movapd 272(%rdi), %xmm13 +; SSE-NEXT: movapd 160(%rdi), %xmm10 ; SSE-NEXT: movapd 192(%rdi), %xmm15 ; SSE-NEXT: movapd %xmm15, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm11[0],xmm12[1] -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm5[0] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm10[0],xmm12[1] +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm6[0] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] ; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm0[0] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm13, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm8[0],xmm15[1] -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm9[0] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm1[0] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; SSE-NEXT: movapd %xmm14, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm9[0],xmm15[1] +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm8[0] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm14, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm6[0] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] +; SSE-NEXT: movapd %xmm13, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm11[0],xmm14[1] +; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm7[0] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] ; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm2[0] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm14[0],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm13[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 80(%rdi), %xmm14 +; SSE-NEXT: movapd 80(%rdi), %xmm13 ; SSE-NEXT: movapd 112(%rdi), %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm14[0],xmm3[1] +; SSE-NEXT: movapd %xmm4, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm13[0],xmm2[1] ; SSE-NEXT: movapd 128(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm0[0] ; SSE-NEXT: movapd 96(%rdi), %xmm1 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd 144(%rdi), %xmm2 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] -; SSE-NEXT: movapd %xmm3, 16(%rsi) -; SSE-NEXT: movapd %xmm13, 48(%rsi) +; SSE-NEXT: movapd 144(%rdi), %xmm3 +; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] +; SSE-NEXT: movapd %xmm2, 16(%rsi) +; SSE-NEXT: movapd %xmm14, 48(%rsi) ; SSE-NEXT: movapd %xmm15, (%rsi) ; SSE-NEXT: movapd %xmm12, 32(%rsi) -; SSE-NEXT: movapd %xmm14, 16(%rdx) -; SSE-NEXT: movapd %xmm10, 48(%rdx) -; SSE-NEXT: movapd %xmm8, (%rdx) -; SSE-NEXT: movapd %xmm11, 32(%rdx) +; SSE-NEXT: movapd %xmm13, 16(%rdx) +; SSE-NEXT: movapd %xmm11, 48(%rdx) +; SSE-NEXT: movapd %xmm9, (%rdx) +; SSE-NEXT: movapd %xmm10, 32(%rdx) ; SSE-NEXT: movapd %xmm0, 16(%rcx) -; SSE-NEXT: movapd %xmm6, 48(%rcx) -; SSE-NEXT: movapd %xmm9, (%rcx) -; SSE-NEXT: movapd %xmm5, 32(%rcx) +; SSE-NEXT: movapd %xmm7, 48(%rcx) +; SSE-NEXT: movapd %xmm8, (%rcx) +; SSE-NEXT: movapd %xmm6, 32(%rcx) ; SSE-NEXT: movapd %xmm1, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movapd %xmm7, (%r8) +; SSE-NEXT: movapd %xmm5, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) -; SSE-NEXT: movapd %xmm2, 16(%r9) +; SSE-NEXT: movapd %xmm3, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -712,15 +712,15 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i64_stride5_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $280, %rsp # imm = 0x118 -; SSE-NEXT: movapd 224(%rdi), %xmm1 +; SSE-NEXT: movapd 224(%rdi), %xmm3 ; SSE-NEXT: movapd 144(%rdi), %xmm2 -; SSE-NEXT: movapd 64(%rdi), %xmm3 +; SSE-NEXT: movapd 64(%rdi), %xmm1 ; SSE-NEXT: movapd 176(%rdi), %xmm4 ; SSE-NEXT: movapd 96(%rdi), %xmm5 -; SSE-NEXT: movapd 208(%rdi), %xmm6 +; SSE-NEXT: movapd 208(%rdi), %xmm7 ; SSE-NEXT: movapd 128(%rdi), %xmm8 ; SSE-NEXT: movapd (%rdi), %xmm10 -; SSE-NEXT: movapd 16(%rdi), %xmm7 +; SSE-NEXT: movapd 16(%rdi), %xmm6 ; SSE-NEXT: movapd 32(%rdi), %xmm14 ; SSE-NEXT: movapd 48(%rdi), %xmm9 ; SSE-NEXT: movapd 160(%rdi), %xmm11 @@ -732,15 +732,15 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm9[0] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm6[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm3[0] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm14[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm0, %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm12[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm8[0] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] @@ -752,14 +752,14 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm13, %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm6[0] +; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm7[0] ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm3[0] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm13[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 ; SSE-NEXT: movapd 272(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 @@ -776,13 +776,13 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 320(%rdi), %xmm14 +; SSE-NEXT: movapd 320(%rdi), %xmm15 ; SSE-NEXT: movapd 352(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 368(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm1[0] ; SSE-NEXT: movapd 336(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -793,65 +793,65 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 400(%rdi), %xmm11 ; SSE-NEXT: movapd 432(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm11[0],xmm15[1] +; SSE-NEXT: movapd %xmm0, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd 448(%rdi), %xmm12 ; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm12[0] -; SSE-NEXT: movapd 416(%rdi), %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm13[0],xmm12[1] +; SSE-NEXT: movapd 416(%rdi), %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm14[0],xmm12[1] ; SSE-NEXT: movapd 464(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm1[0] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 480(%rdi), %xmm2 -; SSE-NEXT: movapd 512(%rdi), %xmm5 -; SSE-NEXT: movapd %xmm5, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1] -; SSE-NEXT: movapd 528(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm3[0] -; SSE-NEXT: movapd 496(%rdi), %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm8[0],xmm3[1] -; SSE-NEXT: movapd 544(%rdi), %xmm9 -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm9[0] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] -; SSE-NEXT: movapd 560(%rdi), %xmm5 -; SSE-NEXT: movapd 592(%rdi), %xmm10 -; SSE-NEXT: movapd %xmm10, %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] -; SSE-NEXT: movapd 608(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm0[0] -; SSE-NEXT: movapd 576(%rdi), %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd 624(%rdi), %xmm4 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm4[0] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm10[0],xmm4[1] -; SSE-NEXT: movapd %xmm7, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, 32(%rsi) -; SSE-NEXT: movapd %xmm6, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, (%rsi) -; SSE-NEXT: movapd %xmm15, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 16(%rsi) -; SSE-NEXT: movapd %xmm2, 96(%rdx) +; SSE-NEXT: movapd 480(%rdi), %xmm4 +; SSE-NEXT: movapd 512(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm6 +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] +; SSE-NEXT: movapd 528(%rdi), %xmm7 +; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm7[0] +; SSE-NEXT: movapd 496(%rdi), %xmm9 +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm9[0],xmm7[1] +; SSE-NEXT: movapd 544(%rdi), %xmm10 +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm10[0] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movapd 560(%rdi), %xmm0 +; SSE-NEXT: movapd 592(%rdi), %xmm5 +; SSE-NEXT: movapd %xmm5, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd 608(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE-NEXT: movapd 576(%rdi), %xmm3 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movapd 624(%rdi), %xmm8 +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm8[0] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] +; SSE-NEXT: movapd %xmm6, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 32(%rsi) +; SSE-NEXT: movapd %xmm2, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movapd %xmm5, 112(%rdx) +; SSE-NEXT: movaps %xmm2, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movapd %xmm14, 64(%rdx) +; SSE-NEXT: movaps %xmm2, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movapd %xmm11, 80(%rdx) +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movapd %xmm13, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movapd %xmm4, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movapd %xmm3, 96(%rcx) -; SSE-NEXT: movapd %xmm0, 112(%rcx) +; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movapd %xmm0, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movapd %xmm15, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movapd %xmm11, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movapd %xmm7, 96(%rcx) +; SSE-NEXT: movapd %xmm1, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movapd %xmm12, 80(%rcx) @@ -863,9 +863,9 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movapd %xmm1, 112(%r8) -; SSE-NEXT: movapd %xmm8, 96(%r8) -; SSE-NEXT: movapd %xmm13, 80(%r8) +; SSE-NEXT: movapd %xmm3, 112(%r8) +; SSE-NEXT: movapd %xmm9, 96(%r8) +; SSE-NEXT: movapd %xmm14, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -876,8 +876,8 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movapd %xmm4, 112(%r9) -; SSE-NEXT: movapd %xmm9, 96(%r9) +; SSE-NEXT: movapd %xmm8, 112(%r9) +; SSE-NEXT: movapd %xmm10, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -895,174 +895,174 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $376, %rsp # imm = 0x178 +; AVX1-ONLY-NEXT: subq $360, %rsp # imm = 0x168 ; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = xmm4[0],xmm6[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm1[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1,2],ymm7[3] ; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm11[0],xmm6[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm5[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm14[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm2[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm3[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm14[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm0[0],xmm12[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm0[0],xmm6[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm13[0,1],ymm15[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm15[0],ymm2[3],ymm15[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm15[0],ymm3[3],ymm15[2] ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm11[0],ymm5[3],ymm11[2] ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm14[0],ymm5[0],ymm14[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm9[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm2[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm8[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[3],ymm8[2] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm9[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm9[0,1,2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm13[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0,1,2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1,2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[3],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[3],ymm10[2] ; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 = xmm12[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 64(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm3, (%r8) +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 64(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm7, (%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm9, 96(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 32(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm4, (%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 96(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 32(%r9) -; AVX1-ONLY-NEXT: addq $376, %rsp # imm = 0x178 +; AVX1-ONLY-NEXT: vmovapd %ymm1, (%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 96(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 32(%r9) +; AVX1-ONLY-NEXT: addq $360, %rsp # imm = 0x168 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1071,18 +1071,18 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: subq $360, %rsp # imm = 0x168 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm8 @@ -1090,147 +1090,147 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm10 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm0[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm8[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm10[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm13[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm14[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm12[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = ymm9[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,3] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = ymm9[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm10 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 64(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 64(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 64(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 96(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 32(%r9) ; AVX2-ONLY-NEXT: addq $360, %rsp # imm = 0x168 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1446,54 +1446,54 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i64_stride5_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $920, %rsp # imm = 0x398 -; SSE-NEXT: movapd 224(%rdi), %xmm0 -; SSE-NEXT: movapd 144(%rdi), %xmm1 -; SSE-NEXT: movapd 64(%rdi), %xmm2 -; SSE-NEXT: movapd 176(%rdi), %xmm3 -; SSE-NEXT: movapd 96(%rdi), %xmm4 -; SSE-NEXT: movapd 208(%rdi), %xmm6 -; SSE-NEXT: movapd 128(%rdi), %xmm7 -; SSE-NEXT: movapd (%rdi), %xmm9 -; SSE-NEXT: movapd 16(%rdi), %xmm5 -; SSE-NEXT: movapd 32(%rdi), %xmm13 -; SSE-NEXT: movapd 48(%rdi), %xmm8 -; SSE-NEXT: movapd 160(%rdi), %xmm10 -; SSE-NEXT: movapd 192(%rdi), %xmm14 -; SSE-NEXT: movapd 80(%rdi), %xmm11 -; SSE-NEXT: movapd 112(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm13, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm9[0],xmm12[1] +; SSE-NEXT: movapd 224(%rdi), %xmm5 +; SSE-NEXT: movapd 144(%rdi), %xmm4 +; SSE-NEXT: movapd 64(%rdi), %xmm3 +; SSE-NEXT: movapd 176(%rdi), %xmm7 +; SSE-NEXT: movapd 96(%rdi), %xmm6 +; SSE-NEXT: movapd 208(%rdi), %xmm9 +; SSE-NEXT: movapd 128(%rdi), %xmm10 +; SSE-NEXT: movapd (%rdi), %xmm12 +; SSE-NEXT: movapd 16(%rdi), %xmm8 +; SSE-NEXT: movapd 32(%rdi), %xmm1 +; SSE-NEXT: movapd 48(%rdi), %xmm11 +; SSE-NEXT: movapd 160(%rdi), %xmm13 +; SSE-NEXT: movapd 192(%rdi), %xmm0 +; SSE-NEXT: movapd 80(%rdi), %xmm14 +; SSE-NEXT: movapd 112(%rdi), %xmm2 +; SSE-NEXT: movapd %xmm1, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm11[0] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm8[0] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm2[0] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm13[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm15, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm11[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm7[0] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm1[0] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm14, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm3[0] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm6[0] +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm10[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm6[0],xmm10[1] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm4[0] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm0[0] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm9[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm5[0] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 ; SSE-NEXT: movapd 272(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 @@ -1639,91 +1639,91 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 960(%rdi), %xmm10 -; SSE-NEXT: movapd 992(%rdi), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm14 +; SSE-NEXT: movapd 992(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm14 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm10[0],xmm14[1] ; SSE-NEXT: movapd 1008(%rdi), %xmm15 ; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm15[0] ; SSE-NEXT: movapd 976(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] -; SSE-NEXT: movapd 1024(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] +; SSE-NEXT: movapd 1024(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1040(%rdi), %xmm8 -; SSE-NEXT: movapd 1072(%rdi), %xmm3 -; SSE-NEXT: movapd %xmm3, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm8[0],xmm13[1] -; SSE-NEXT: movapd 1088(%rdi), %xmm9 -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm9[0] -; SSE-NEXT: movapd 1056(%rdi), %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: movapd 1104(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movapd 1120(%rdi), %xmm1 -; SSE-NEXT: movapd 1152(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm5 -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: movapd 1168(%rdi), %xmm6 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm6[0] -; SSE-NEXT: movapd 1136(%rdi), %xmm11 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm11[0],xmm6[1] -; SSE-NEXT: movapd 1184(%rdi), %xmm2 -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm2[0] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd 1040(%rdi), %xmm8 +; SSE-NEXT: movapd 1072(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm9 +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] +; SSE-NEXT: movapd 1088(%rdi), %xmm11 +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm11[0] +; SSE-NEXT: movapd 1056(%rdi), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] +; SSE-NEXT: movapd 1104(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd 1120(%rdi), %xmm5 +; SSE-NEXT: movapd 1152(%rdi), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm3 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1] +; SSE-NEXT: movapd 1168(%rdi), %xmm6 +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm6[0] +; SSE-NEXT: movapd 1136(%rdi), %xmm12 +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm12[0],xmm6[1] +; SSE-NEXT: movapd 1184(%rdi), %xmm0 +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm0[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1200(%rdi), %xmm0 -; SSE-NEXT: movapd 1232(%rdi), %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd 1248(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm3[0] +; SSE-NEXT: movapd 1232(%rdi), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd 1248(%rdi), %xmm4 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm4[0] ; SSE-NEXT: movapd 1216(%rdi), %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] -; SSE-NEXT: movapd 1264(%rdi), %xmm12 -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm12[0] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] -; SSE-NEXT: movapd %xmm5, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rsi) -; SSE-NEXT: movapd %xmm2, 240(%rsi) +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] +; SSE-NEXT: movapd 1264(%rdi), %xmm13 +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm13[0] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] +; SSE-NEXT: movapd %xmm3, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 176(%rsi) +; SSE-NEXT: movaps %xmm2, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 112(%rsi) +; SSE-NEXT: movaps %xmm2, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movapd %xmm1, 240(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movapd %xmm14, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movapd %xmm13, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movapd %xmm1, 224(%rdx) -; SSE-NEXT: movapd %xmm0, 240(%rdx) -; SSE-NEXT: movapd %xmm10, 192(%rdx) -; SSE-NEXT: movapd %xmm8, 208(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movapd %xmm9, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movapd %xmm5, 224(%rdx) +; SSE-NEXT: movapd %xmm0, 240(%rdx) +; SSE-NEXT: movapd %xmm10, 192(%rdx) +; SSE-NEXT: movapd %xmm8, 208(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) @@ -1743,9 +1743,9 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm3, 240(%rcx) +; SSE-NEXT: movapd %xmm4, 240(%rcx) ; SSE-NEXT: movapd %xmm6, 224(%rcx) -; SSE-NEXT: movapd %xmm9, 208(%rcx) +; SSE-NEXT: movapd %xmm11, 208(%rcx) ; SSE-NEXT: movapd %xmm15, 192(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rcx) @@ -1772,7 +1772,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movapd %xmm7, 240(%r8) -; SSE-NEXT: movapd %xmm11, 224(%r8) +; SSE-NEXT: movapd %xmm12, 224(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1801,7 +1801,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movapd %xmm12, 240(%r9) +; SSE-NEXT: movapd %xmm13, 240(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -1837,479 +1837,474 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1384, %rsp # imm = 0x568 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm9 +; AVX1-ONLY-NEXT: subq $1368, %rsp # imm = 0x558 +; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm9[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm9[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm6[0],xmm2[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm13[0],xmm2[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm6[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm15[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm1[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 736(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 672(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm2[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm4[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm10[0],ymm7[3],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm9[0],ymm7[0],ymm9[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm9[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm14[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm10[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm11[0],ymm7[0],ymm11[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm13[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm10[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm11[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm12[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm15[0],ymm6[0],ymm15[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm12[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm14[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[0],ymm4[0],ymm0[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm5[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 816(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 816(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm4[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2,3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendps $12, (%rsp), %xmm6, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm6[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 976(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm2[0],ymm12[0],ymm2[3],ymm12[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm13[0],ymm4[3],ymm13[2] ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm5[0],ymm2[0],ymm5[3],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm7[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm11[0],ymm8[3],ymm11[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm2[0],ymm8[3],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm15[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm4[0],ymm14[0],ymm4[3],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[0],ymm5[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm9[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[3],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[3],ymm7[2] +; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[0],ymm3[0],ymm0[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm12[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[3],ymm8[2] -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[3],ymm12[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm6, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 192(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 160(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 64(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm15, 32(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm13, (%r9) -; AVX1-ONLY-NEXT: addq $1384, %rsp # imm = 0x568 +; AVX1-ONLY-NEXT: vmovapd %ymm2, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 160(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 128(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 64(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) +; AVX1-ONLY-NEXT: addq $1368, %rsp # imm = 0x558 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride5_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1544, %rsp # imm = 0x608 -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: subq $1464, %rsp # imm = 0x5B8 +; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm13 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm3[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2317,76 +2312,78 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm2[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm9[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm14 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm11 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm10 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm14[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm8 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -2447,582 +2444,572 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm9[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm1[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, (%rsp), %xmm15, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 192(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 128(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX2-ONLY-NEXT: addq $1544, %rsp # imm = 0x608 +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 224(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 192(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 128(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm15, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 64(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, (%r9) +; AVX2-ONLY-NEXT: addq $1464, %rsp # imm = 0x5B8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride5_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm20 +; AVX512F-NEXT: subq $584, %rsp # imm = 0x248 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm27 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [12,1,6,0,12,1,6,0] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm28 = [0,5,10,15] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm28, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm13, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm30 = <1,6,11,u> -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm30, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,10,15,0,5,10,15,0] -; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm17, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm17, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm17, %zmm21 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm17 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,11,0,1,6,11,0,1] -; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm25, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm23 = <2,7,12,u> -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm23, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [12,1,6,0,12,1,6,0] +; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm26, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm24, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm26, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm26, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm21, %zmm1, %zmm26 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u> +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm12, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm12, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm21, %zmm12 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1] +; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <2,7,12,u> +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm15, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm21, %zmm23 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] ; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm29, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,5,0,11,0,5,0,11] -; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm22, %zmm31, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,0,11,0,5,0,11] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm31, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm20 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm31, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <11,0,5,u> -; AVX512F-NEXT: vpermt2q %zmm9, %zmm31, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm29, %zmm17 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm13, %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm31 = <12,1,6,u> -; AVX512F-NEXT: vpermt2q %zmm15, %zmm31, %zmm18 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm28, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm30, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm23, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm13, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm31, %zmm19 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm28, %zmm5 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm21 = <11,0,5,u> +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm21, %zmm20 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <12,1,6,u> +; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm13 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm24, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm21, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm30, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm31, %zmm15 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm14[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm14 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm14 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm12 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm10 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm10 = zmm28[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm28, %zmm31, %zmm16 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm2 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm31, %zmm14 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm31, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm28, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm24, %zmm11 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm26 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm24, %zmm10 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 ; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm24, %zmm7 ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] -; AVX512F-NEXT: vpermt2q %zmm28, %zmm7, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512F-NEXT: vpermt2q %zmm28, %zmm7, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] -; AVX512F-NEXT: vpermt2q %zmm28, %zmm7, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] -; AVX512F-NEXT: vpermt2q %zmm28, %zmm7, %zmm18 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm13 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm17, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm27, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm23, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm18, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%r9) -; AVX512F-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm11, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm30, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm19, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm5, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%r9) +; AVX512F-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride5_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm20 +; AVX512BW-NEXT: subq $584, %rsp # imm = 0x248 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm27 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [12,1,6,0,12,1,6,0] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm13, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm30 = <1,6,11,u> -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm30, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,10,15,0,5,10,15,0] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm17, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm17, %zmm21 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm17 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,11,0,1,6,11,0,1] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm23 = <2,7,12,u> -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [12,1,6,0,12,1,6,0] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm26, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm24, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm26, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm26, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm1, %zmm26 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u> +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm12 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <2,7,12,u> +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm23 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] ; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,5,0,11,0,5,0,11] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,0,11,0,5,0,11] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm29, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <11,0,5,u> -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm17 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm13, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = <12,1,6,u> -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm18 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm28, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm30, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm13, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm19 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm28, %zmm5 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <11,0,5,u> +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm21, %zmm20 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <12,1,6,u> +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm13 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm30, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm31, %zmm15 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm14[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm14 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm12 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm10 = zmm28[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm16 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm31, %zmm14 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm31, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm28, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm24, %zmm11 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm10 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 ; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm7 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm7, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm7, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm7, %zmm18 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm13 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm27, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%r9) -; AVX512BW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm30, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r9) +; AVX512BW-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <160 x i64>, ptr %in.vec, align 64 @@ -3042,55 +3029,55 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i64_stride5_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $2200, %rsp # imm = 0x898 -; SSE-NEXT: movapd 224(%rdi), %xmm0 -; SSE-NEXT: movapd 144(%rdi), %xmm1 -; SSE-NEXT: movapd 64(%rdi), %xmm2 -; SSE-NEXT: movapd 176(%rdi), %xmm3 -; SSE-NEXT: movapd 96(%rdi), %xmm4 -; SSE-NEXT: movapd 208(%rdi), %xmm6 -; SSE-NEXT: movapd 128(%rdi), %xmm7 -; SSE-NEXT: movapd (%rdi), %xmm9 -; SSE-NEXT: movapd 16(%rdi), %xmm5 -; SSE-NEXT: movapd 32(%rdi), %xmm13 -; SSE-NEXT: movapd 48(%rdi), %xmm8 -; SSE-NEXT: movapd 160(%rdi), %xmm10 -; SSE-NEXT: movapd 192(%rdi), %xmm14 -; SSE-NEXT: movapd 80(%rdi), %xmm11 -; SSE-NEXT: movapd 112(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm13, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm9[0],xmm12[1] +; SSE-NEXT: subq $2216, %rsp # imm = 0x8A8 +; SSE-NEXT: movapd 224(%rdi), %xmm5 +; SSE-NEXT: movapd 144(%rdi), %xmm4 +; SSE-NEXT: movapd 64(%rdi), %xmm3 +; SSE-NEXT: movapd 176(%rdi), %xmm7 +; SSE-NEXT: movapd 96(%rdi), %xmm6 +; SSE-NEXT: movapd 208(%rdi), %xmm10 +; SSE-NEXT: movapd 128(%rdi), %xmm9 +; SSE-NEXT: movapd (%rdi), %xmm12 +; SSE-NEXT: movapd 16(%rdi), %xmm8 +; SSE-NEXT: movapd 32(%rdi), %xmm0 +; SSE-NEXT: movapd 48(%rdi), %xmm11 +; SSE-NEXT: movapd 160(%rdi), %xmm13 +; SSE-NEXT: movapd 192(%rdi), %xmm1 +; SSE-NEXT: movapd 80(%rdi), %xmm14 +; SSE-NEXT: movapd 112(%rdi), %xmm2 +; SSE-NEXT: movapd %xmm0, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm11[0] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm8[0] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm2[0] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm13[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm15, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm11[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm7[0] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm1[0] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm14, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm6[0] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm3[0] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm9[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm6[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm4[0] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm10[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm7[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm5[0] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 ; SSE-NEXT: movapd 272(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 @@ -3450,7 +3437,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 2048(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movapd 2016(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3477,24 +3464,24 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 2160(%rdi), %xmm14 ; SSE-NEXT: movapd 2192(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm14[0],xmm13[1] +; SSE-NEXT: movapd %xmm0, %xmm11 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm14[0],xmm11[1] ; SSE-NEXT: movapd 2208(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm1[0] ; SSE-NEXT: movapd 2176(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 2224(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 2240(%rdi), %xmm11 +; SSE-NEXT: movapd 2240(%rdi), %xmm12 ; SSE-NEXT: movapd 2272(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm9 -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm11[0],xmm9[1] +; SSE-NEXT: movapd %xmm0, %xmm8 +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm12[0],xmm8[1] ; SSE-NEXT: movapd 2288(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm1[0] ; SSE-NEXT: movapd 2256(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3503,110 +3490,111 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 2320(%rdi), %xmm7 -; SSE-NEXT: movapd 2352(%rdi), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm5 -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] +; SSE-NEXT: movapd 2320(%rdi), %xmm9 +; SSE-NEXT: movapd 2352(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm6 +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] ; SSE-NEXT: movapd 2368(%rdi), %xmm15 -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm15[0] +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm15[0] ; SSE-NEXT: movapd 2336(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] -; SSE-NEXT: movapd 2384(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] +; SSE-NEXT: movapd 2384(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 2400(%rdi), %xmm6 -; SSE-NEXT: movapd 2432(%rdi), %xmm10 -; SSE-NEXT: movapd %xmm10, %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1] -; SSE-NEXT: movapd 2448(%rdi), %xmm12 -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm12[0] -; SSE-NEXT: movapd 2416(%rdi), %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] -; SSE-NEXT: movapd 2464(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 2480(%rdi), %xmm0 -; SSE-NEXT: movapd 2512(%rdi), %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm1 +; SSE-NEXT: movapd 2400(%rdi), %xmm7 +; SSE-NEXT: movapd 2432(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] +; SSE-NEXT: movapd 2448(%rdi), %xmm13 +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm13[0] +; SSE-NEXT: movapd 2416(%rdi), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] +; SSE-NEXT: movapd 2464(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd 2528(%rdi), %xmm8 -; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm8[0] -; SSE-NEXT: movapd 2496(%rdi), %xmm10 -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm10[0],xmm8[1] -; SSE-NEXT: movapd 2544(%rdi), %xmm2 -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm2[0] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] -; SSE-NEXT: movapd %xmm1, 496(%rsi) -; SSE-NEXT: movapd %xmm3, 480(%rsi) -; SSE-NEXT: movapd %xmm5, 464(%rsi) -; SSE-NEXT: movapd %xmm9, 448(%rsi) -; SSE-NEXT: movapd %xmm13, 432(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 416(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 400(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 384(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 368(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 352(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 336(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 320(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 304(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 288(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 272(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 256(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 176(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movapd %xmm0, 496(%rdx) -; SSE-NEXT: movapd %xmm6, 480(%rdx) -; SSE-NEXT: movapd %xmm7, 464(%rdx) -; SSE-NEXT: movapd %xmm11, 448(%rdx) +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 2480(%rdi), %xmm5 +; SSE-NEXT: movapd 2512(%rdi), %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] +; SSE-NEXT: movapd 2528(%rdi), %xmm10 +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm10[0] +; SSE-NEXT: movapd 2496(%rdi), %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movapd 2544(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movapd %xmm2, 496(%rsi) +; SSE-NEXT: movapd %xmm4, 480(%rsi) +; SSE-NEXT: movapd %xmm6, 464(%rsi) +; SSE-NEXT: movapd %xmm8, 448(%rsi) +; SSE-NEXT: movapd %xmm11, 432(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 416(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 384(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 368(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 320(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 304(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 272(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 256(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movapd %xmm5, 496(%rdx) +; SSE-NEXT: movapd %xmm7, 480(%rdx) +; SSE-NEXT: movapd %xmm9, 464(%rdx) +; SSE-NEXT: movapd %xmm12, 448(%rdx) ; SSE-NEXT: movapd %xmm14, 432(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 400(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%rdx) @@ -3658,12 +3646,12 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movapd %xmm8, 496(%rcx) -; SSE-NEXT: movapd %xmm12, 480(%rcx) +; SSE-NEXT: movapd %xmm10, 496(%rcx) +; SSE-NEXT: movapd %xmm13, 480(%rcx) ; SSE-NEXT: movapd %xmm15, 464(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 448(%rcx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 432(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rcx) @@ -3719,7 +3707,8 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movapd %xmm10, 496(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 496(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 480(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3782,7 +3771,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movapd %xmm2, 496(%r9) +; SSE-NEXT: movapd %xmm1, 496(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 480(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3845,215 +3834,213 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) -; SSE-NEXT: addq $2200, %rsp # imm = 0x898 +; SSE-NEXT: addq $2216, %rsp # imm = 0x8A8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride5_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3288, %rsp # imm = 0xCD8 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: subq $3256, %rsp # imm = 0xCB8 +; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm13[0],xmm3[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm5[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm4[0],xmm7[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm7[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1440(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm6[0],xmm9[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1856(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1824(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm9[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovapd 1792(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm11 = xmm8[0],xmm11[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2144(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovaps 2112(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2080(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm10[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1440(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm12[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1856(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1824(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1792(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2176(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2144(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 2112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2080(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm15[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2400(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm12[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = xmm11[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2336(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2304(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2272(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2304(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 2272(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm14[0],ymm2[3],ymm14[2] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] @@ -4062,43 +4049,38 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[3],ymm0[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 1488(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm6[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1808(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1808(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm13[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 2128(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2208(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 2128(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm15[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2528(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2528(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 2448(%rdi), %xmm1 @@ -4107,22 +4089,26 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm0 @@ -4147,223 +4133,222 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1328(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 1328(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1648(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2048(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1968(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1648(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, (%rsp), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2048(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm0[0],ymm3[0],ymm0[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 1968(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2368(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovaps 2368(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm0[0],ymm6[0],ymm0[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm6[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm11[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 1776(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 1776(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 2096(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 2096(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 2416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2416(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm15[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2336(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm0[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa 2256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 2256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 1936(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 1936(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 1296(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 1296(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm3[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm5[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm5[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm4[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = xmm8[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm7[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm10[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm9[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm14[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[0],ymm12[0],ymm9[3],ymm12[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[3],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[3],ymm9[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[3],ymm9[2] ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[3],ymm7[2] ; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm8[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[3],ymm7[2] @@ -4373,52 +4358,52 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[3],ymm6[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[3],ymm6[2] ; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[3],ymm5[2] ; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1408(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 1408(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 1568(%rdi), %ymm14 @@ -4428,7 +4413,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[3],ymm12[2] ; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm11 @@ -4472,8 +4457,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovapd 2528(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2] ; AVX1-ONLY-NEXT: vmovdqa 2464(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = xmm15[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4698,7 +4682,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) @@ -4734,21 +4718,21 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX1-ONLY-NEXT: addq $3288, %rsp # imm = 0xCD8 +; AVX1-ONLY-NEXT: addq $3256, %rsp # imm = 0xCB8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride5_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $3240, %rsp # imm = 0xCA8 -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 @@ -4761,317 +4745,316 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2144(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2080(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2496(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 2464(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1824(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1792(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 2432(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2400(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm3[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2016(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1920(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1920(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2272(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2240(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 2240(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm11[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm12[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1488(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm13[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1808(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm14[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2128(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm15[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vmovdqa 1488(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1808(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2448(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 2128(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2448(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm13 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm11 -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm7, %ymm9 -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm7 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1328(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm5 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1648(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm4 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1968(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2048(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2048(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2368(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2368(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5187,8 +5170,8 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5204,11 +5187,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5219,22 +5202,23 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] @@ -5248,14 +5232,14 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] @@ -5270,7 +5254,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] @@ -5278,15 +5262,16 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] @@ -5321,87 +5306,87 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = mem[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = mem[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = mem[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = mem[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm10 = mem[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm9 = mem[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm7 = mem[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = mem[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm1[6,7] @@ -5537,22 +5522,22 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 480(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 448(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 416(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 384(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 352(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 320(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 288(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 256(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, 224(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm15, 192(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 160(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm13, 128(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 448(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 416(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 384(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 352(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm13, 320(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 288(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 256(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, 224(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 192(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 160(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 128(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 32(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX2-ONLY-NEXT: addq $3240, %rsp # imm = 0xCA8 @@ -5561,949 +5546,937 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-LABEL: load_i64_stride5_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm19 +; AVX512F-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm26 ; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,1,6,0,12,1,6,0] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [12,1,6,0,12,1,6,0] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm9, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm15, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm9, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,10,15,0,5,10,15,0] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,10,15,0,5,10,15,0] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,11,0,1,6,11,0,1] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [6,11,0,1,6,11,0,1] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [7,12,0,2,7,12,0,2] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,12,0,2,7,12,0,2] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,0,11,0,5,0,11] ; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm3, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm9, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm15, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm19 +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 ; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm6 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm6, %zmm9 +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm5 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm5, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm5, %zmm1, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm6, %zmm1, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm1, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm6, %zmm1, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm1, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm6, %zmm1, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm31 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,10,15] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm5 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <1,6,11,u> +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <2,7,12,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <11,0,5,u> -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <12,1,6,u> -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm31 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm1, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm27 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm26 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm1, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm30 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm24 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm28, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm8, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm13 -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm29 -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm7, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm28, %zmm16 -; AVX512F-NEXT: vpermi2q %zmm7, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm20 -; AVX512F-NEXT: vpermi2q %zmm7, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm11, %zmm21 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm7, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm15, %zmm10 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm9 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm2 = zmm17[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm14 = zmm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm18[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, (%rsp), %zmm12, %zmm5 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm5 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm29[0,1,2,3],zmm19[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm0 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <11,0,5,u> +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm29 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm15 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm18 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm14 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm17 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm25 = <12,1,6,u> +; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm31 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm26 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm21 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm20 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm4, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm12 +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm0 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm8, %zmm23 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm10, %zmm27 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm25, %zmm9 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm15 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm25, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm17 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm25, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm11 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm25, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm25, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm19 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm13 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,1,2,3,4,5,6,12] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm28, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm28, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm28, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm4, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm28, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm12, %zmm28, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm28, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm24 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,13] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm11, %zmm3, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,9,14] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,10,15] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm31 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm27 {%k1} -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm26 {%k1} -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm30 {%k1} -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm10 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rsi) -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm16, 256(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm26 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512F-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 448(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 384(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 320(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 256(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 128(%rsi) +; AVX512F-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, (%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 448(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 256(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 320(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 128(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 192(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, (%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 64(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 384(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm8, 448(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm22, 256(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm25, 320(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm20, 384(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm23, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm1, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm28, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm21, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm10, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm24, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm30, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm30, 192(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, (%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm10, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm29, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm9, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm6, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm20, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm31, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm27, 64(%r9) -; AVX512F-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512F-NEXT: vmovdqa64 %zmm26, 64(%r9) +; AVX512F-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride5_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm19 +; AVX512BW-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm26 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,1,6,0,12,1,6,0] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [12,1,6,0,12,1,6,0] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm15, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,10,15,0,5,10,15,0] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,11,0,1,6,11,0,1] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [6,11,0,1,6,11,0,1] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [7,12,0,2,7,12,0,2] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,12,0,2,7,12,0,2] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,0,11,0,5,0,11] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm9, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm19 +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 ; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm6, %zmm9 +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm1, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm31 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <1,6,11,u> +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <2,7,12,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <11,0,5,u> -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <12,1,6,u> -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm31 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm27 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm26 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm30 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm24 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm8, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm13 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <11,0,5,u> +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = <12,1,6,u> +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm31 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm26 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm21 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm20 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm4, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm12 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm28, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm21 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm10 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm9 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm2 = zmm17[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm14 = zmm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm23 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm10, %zmm27 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm9 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload ; AVX512BW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm18[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm12, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm5 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm29[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm0 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm28, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm28, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm28, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm28, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm27 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm27 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm30 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 448(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rsi) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 256(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 448(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 384(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 320(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 256(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 128(%rsi) +; AVX512BW-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, (%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 448(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 256(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 320(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 192(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, (%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 64(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 384(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 256(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 320(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 384(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm28, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, (%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm31, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%r9) -; AVX512BW-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r9) +; AVX512BW-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <320 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll index 05f07039fd67e..ffdee1430c711 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll @@ -402,109 +402,109 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i64_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movaps 160(%rdi), %xmm8 -; SSE-NEXT: movaps 256(%rdi), %xmm9 -; SSE-NEXT: movaps 208(%rdi), %xmm0 -; SSE-NEXT: movaps 352(%rdi), %xmm12 -; SSE-NEXT: movaps 304(%rdi), %xmm1 -; SSE-NEXT: movaps 64(%rdi), %xmm15 -; SSE-NEXT: movaps (%rdi), %xmm3 -; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 48(%rdi), %xmm10 -; SSE-NEXT: movaps 144(%rdi), %xmm14 -; SSE-NEXT: movaps 96(%rdi), %xmm4 -; SSE-NEXT: movaps 240(%rdi), %xmm13 -; SSE-NEXT: movaps 192(%rdi), %xmm5 -; SSE-NEXT: movaps 336(%rdi), %xmm11 -; SSE-NEXT: movaps 288(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0] -; SSE-NEXT: movaps %xmm7, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm14[1] +; SSE-NEXT: movaps 160(%rdi), %xmm10 +; SSE-NEXT: movaps 256(%rdi), %xmm13 +; SSE-NEXT: movaps 208(%rdi), %xmm5 +; SSE-NEXT: movaps 352(%rdi), %xmm15 +; SSE-NEXT: movaps 304(%rdi), %xmm6 +; SSE-NEXT: movaps 64(%rdi), %xmm0 +; SSE-NEXT: movaps (%rdi), %xmm8 +; SSE-NEXT: movaps 16(%rdi), %xmm7 +; SSE-NEXT: movaps 48(%rdi), %xmm1 +; SSE-NEXT: movaps 144(%rdi), %xmm2 +; SSE-NEXT: movaps 96(%rdi), %xmm11 +; SSE-NEXT: movaps 240(%rdi), %xmm3 +; SSE-NEXT: movaps 192(%rdi), %xmm12 +; SSE-NEXT: movaps 336(%rdi), %xmm4 +; SSE-NEXT: movaps 288(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm4[0] +; SSE-NEXT: movaps %xmm14, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1] +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm15[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] -; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] +; SSE-NEXT: movaps %xmm8, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] -; SSE-NEXT: movaps 368(%rdi), %xmm1 -; SSE-NEXT: movaps 320(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movaps 272(%rdi), %xmm1 -; SSE-NEXT: movaps 224(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE-NEXT: movaps 176(%rdi), %xmm1 -; SSE-NEXT: movaps 128(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm13, 16(%rsi) -; SSE-NEXT: movaps %xmm11, 32(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movaps %xmm14, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps %xmm9, 16(%rcx) -; SSE-NEXT: movaps %xmm12, 32(%rcx) -; SSE-NEXT: movaps %xmm15, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps %xmm7, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%r8) +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm15[1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] +; SSE-NEXT: movaps 80(%rdi), %xmm0 +; SSE-NEXT: movaps 32(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 368(%rdi), %xmm0 +; SSE-NEXT: movaps 320(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 272(%rdi), %xmm0 +; SSE-NEXT: movaps 224(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movaps 176(%rdi), %xmm0 +; SSE-NEXT: movaps 128(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm11, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps %xmm12, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm9, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm7, 16(%rcx) +; SSE-NEXT: movaps %xmm15, 32(%rcx) +; SSE-NEXT: movaps %xmm14, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm13, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movaps %xmm2, 16(%r9) -; SSE-NEXT: movaps %xmm5, 32(%r9) +; SSE-NEXT: movaps %xmm4, 32(%r9) ; SSE-NEXT: movaps %xmm6, 48(%r9) -; SSE-NEXT: movaps %xmm10, (%r9) +; SSE-NEXT: movaps %xmm8, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm4, 32(%rax) -; SSE-NEXT: movaps %xmm3, 48(%rax) -; SSE-NEXT: movaps %xmm8, (%rax) +; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps %xmm3, 32(%rax) +; SSE-NEXT: movaps %xmm5, 48(%rax) +; SSE-NEXT: movaps %xmm10, (%rax) ; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; @@ -597,54 +597,54 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm7[0] +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm6[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm11[0],xmm8[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm10[0],xmm8[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm5[1],ymm12[3],ymm5[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm11[1],xmm8[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm2[1],ymm6[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm10[1],xmm8[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm10[0],xmm12[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm11[0],xmm12[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm15[1],ymm9[3],ymm15[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm14[1],xmm13[1] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm10[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm15[0],ymm5[2],ymm15[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] @@ -656,20 +656,20 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 136(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm12[1],xmm10[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 328(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm14[1],xmm13[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm12[1],xmm11[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 328(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm0[1],ymm11[3],ymm0[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm9, (%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r9) @@ -682,80 +682,80 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-LABEL: load_i64_stride6_vf8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <0,6,12,u> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <0,6,12,u> +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: movb $56, %dil ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [4,10,4,10,4,10,4,10] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] ; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm7 ; AVX512F-NEXT: movb $-64, %dil ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <1,7,13,u> ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,11,5,11,5,11,5,11] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 {%k2} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [10,4,10,4,10,4,10,4] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <10,0,6,u> ; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512F-NEXT: movb $24, %dil -; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} -; AVX512F-NEXT: movb $-32, %dil ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,5,11,5,11,5,11,5] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512F-NEXT: movb $-32, %dil +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <11,1,7,u> ; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [12,0,0,6,12,0,0,6] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512F-NEXT: vinserti32x4 $0, %xmm7, %zmm5, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm5 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [13,0,1,7,13,0,1,7] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 -; AVX512F-NEXT: vinserti32x4 $0, %xmm0, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm4, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512F-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm8, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm9, (%r8) @@ -767,80 +767,80 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-LABEL: load_i64_stride6_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <0,6,12,u> -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,6,12,u> +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: movb $56, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [4,10,4,10,4,10,4,10] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm7 ; AVX512BW-NEXT: movb $-64, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <1,7,13,u> ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,11,5,11,5,11,5,11] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 {%k2} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [10,4,10,4,10,4,10,4] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <10,0,6,u> ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: movb $24, %dil -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} -; AVX512BW-NEXT: movb $-32, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512BW-NEXT: movb $-32, %dil +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <11,1,7,u> ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [12,0,0,6,12,0,0,6] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm7, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [13,0,1,7,13,0,1,7] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) @@ -949,7 +949,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 304(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 448(%rdi), %xmm0 @@ -958,25 +958,26 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 544(%rdi), %xmm0 -; SSE-NEXT: movaps 496(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 496(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 640(%rdi), %xmm0 -; SSE-NEXT: movaps 592(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps 592(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 736(%rdi), %xmm0 -; SSE-NEXT: movaps 688(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movaps 688(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 80(%rdi), %xmm0 ; SSE-NEXT: movaps 32(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -991,26 +992,25 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 272(%rdi), %xmm0 -; SSE-NEXT: movaps 224(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 320(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 464(%rdi), %xmm0 -; SSE-NEXT: movaps 416(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 416(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] ; SSE-NEXT: movaps 560(%rdi), %xmm0 -; SSE-NEXT: movaps 512(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movaps 512(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] ; SSE-NEXT: movaps 656(%rdi), %xmm0 ; SSE-NEXT: movaps 608(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm4 @@ -1063,16 +1063,17 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm8, 112(%r8) -; SSE-NEXT: movaps %xmm12, 96(%r8) -; SSE-NEXT: movaps %xmm14, 80(%r8) +; SSE-NEXT: movaps %xmm11, 112(%r8) +; SSE-NEXT: movaps %xmm15, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%r8) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) @@ -1085,10 +1086,9 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, 112(%r9) ; SSE-NEXT: movaps %xmm4, 96(%r9) ; SSE-NEXT: movaps %xmm7, 80(%r9) -; SSE-NEXT: movaps %xmm9, 64(%r9) -; SSE-NEXT: movaps %xmm15, 48(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r9) +; SSE-NEXT: movaps %xmm8, 64(%r9) +; SSE-NEXT: movaps %xmm10, 48(%r9) +; SSE-NEXT: movaps %xmm14, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1096,10 +1096,10 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm1, 112(%rax) ; SSE-NEXT: movaps %xmm3, 96(%rax) -; SSE-NEXT: movaps %xmm5, 80(%rax) -; SSE-NEXT: movaps %xmm6, 64(%rax) -; SSE-NEXT: movaps %xmm10, 48(%rax) -; SSE-NEXT: movaps %xmm11, 32(%rax) +; SSE-NEXT: movaps %xmm6, 80(%rax) +; SSE-NEXT: movaps %xmm5, 64(%rax) +; SSE-NEXT: movaps %xmm9, 48(%rax) +; SSE-NEXT: movaps %xmm12, 32(%rax) ; SSE-NEXT: movaps %xmm13, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) @@ -1158,109 +1158,109 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm11[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm14[0],ymm2[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm13[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm14[0],xmm13[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm15[0],ymm7[0],ymm15[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm4[0],xmm5[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm6[0],ymm15[0],ymm6[2],ymm15[2] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm3[0],xmm4[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] ; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm14[1],ymm9[3],ymm14[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm10[1],xmm13[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm7[1],ymm15[3],ymm7[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm15[1],ymm6[3],ymm15[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm4[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 512(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 512(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovaps 464(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm13[0],xmm14[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm12[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm4[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm12[0],xmm9[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm13[0],ymm2[2],ymm13[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] ; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm13[1],ymm2[3],ymm13[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm14[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm14[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm12[1],xmm8[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm12[1],xmm9[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) @@ -1294,7 +1294,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -1305,221 +1305,222 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) ; AVX1-ONLY-NEXT: addq $552, %rsp # imm = 0x228 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride6_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $520, %rsp # imm = 0x208 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 +; AVX2-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm2[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm15[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-ONLY-NEXT: vmovaps %ymm7, %ymm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm10[0],xmm9[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm12[0],ymm5[0],ymm12[2],ymm5[2] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm11[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX2-ONLY-NEXT: vmovaps %ymm10, %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm3[0],ymm13[2],ymm3[2] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm14[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm11[1],ymm2[3],ymm11[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm14[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm15[1],ymm5[3],ymm15[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm15[1] -; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] -; AVX2-ONLY-NEXT: vmovaps %ymm6, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm3[1] +; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm11[1] +; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] ; AVX2-ONLY-NEXT: vbroadcastsd 680(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovaps %ymm10, %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm10[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm3[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm6[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm8[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm12[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm2[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm1[0],xmm13[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm4[1],ymm10[1],ymm4[3],ymm10[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm13[1],ymm1[1],ymm13[3],ymm1[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm8[1] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm14[1],ymm11[1],ymm14[3],ymm11[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm15[0],ymm10[0],ymm15[2],ymm10[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm9[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm6[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm7[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm5[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm3[0],xmm2[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm12[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm13[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 712(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm9[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 712(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 520(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 136(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm12[1],ymm1[3],ymm12[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 328(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm11[1],ymm6[3],ymm11[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%r8) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm14, (%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r9) -; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 96(%rax) -; AVX2-ONLY-NEXT: addq $520, %rsp # imm = 0x208 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm7[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 136(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 328(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm13, (%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rax) +; AVX2-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1830,15 +1831,15 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: subq $1176, %rsp # imm = 0x498 ; SSE-NEXT: movaps 624(%rdi), %xmm0 ; SSE-NEXT: movaps 576(%rdi), %xmm9 -; SSE-NEXT: movaps 240(%rdi), %xmm1 +; SSE-NEXT: movaps 240(%rdi), %xmm3 ; SSE-NEXT: movaps 192(%rdi), %xmm8 -; SSE-NEXT: movaps 720(%rdi), %xmm2 +; SSE-NEXT: movaps 720(%rdi), %xmm1 ; SSE-NEXT: movaps 672(%rdi), %xmm11 -; SSE-NEXT: movaps 336(%rdi), %xmm4 +; SSE-NEXT: movaps 336(%rdi), %xmm5 ; SSE-NEXT: movaps 288(%rdi), %xmm10 -; SSE-NEXT: movaps 432(%rdi), %xmm5 +; SSE-NEXT: movaps 432(%rdi), %xmm4 ; SSE-NEXT: movaps 384(%rdi), %xmm12 -; SSE-NEXT: movaps 912(%rdi), %xmm3 +; SSE-NEXT: movaps 912(%rdi), %xmm2 ; SSE-NEXT: movaps 528(%rdi), %xmm6 ; SSE-NEXT: movaps 480(%rdi), %xmm14 ; SSE-NEXT: movaps 144(%rdi), %xmm7 @@ -1849,29 +1850,29 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm10, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: movaps %xmm8, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] +; SSE-NEXT: movaps %xmm12, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] +; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -1880,9 +1881,9 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 864(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 816(%rdi), %xmm0 ; SSE-NEXT: movaps 768(%rdi), %xmm1 @@ -2084,7 +2085,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 416(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 560(%rdi), %xmm0 @@ -2093,7 +2094,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 656(%rdi), %xmm0 ; SSE-NEXT: movaps 608(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -2102,23 +2103,23 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 752(%rdi), %xmm0 -; SSE-NEXT: movaps 704(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps 704(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 848(%rdi), %xmm0 -; SSE-NEXT: movaps 800(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps 800(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 944(%rdi), %xmm0 -; SSE-NEXT: movaps 896(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps 896(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 1040(%rdi), %xmm0 ; SSE-NEXT: movaps 992(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, %xmm12 @@ -2293,7 +2294,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r9) @@ -2310,12 +2311,12 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm7, 192(%rax) ; SSE-NEXT: movaps %xmm9, 176(%rax) ; SSE-NEXT: movaps %xmm10, 160(%rax) -; SSE-NEXT: movaps %xmm14, 144(%rax) -; SSE-NEXT: movaps %xmm13, 128(%rax) -; SSE-NEXT: movaps %xmm15, 112(%rax) +; SSE-NEXT: movaps %xmm13, 144(%rax) +; SSE-NEXT: movaps %xmm15, 128(%rax) +; SSE-NEXT: movaps %xmm14, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) @@ -2506,26 +2507,26 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm12[0] +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm14[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] @@ -2574,8 +2575,8 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm13[1],ymm3[3],ymm13[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] @@ -2585,7 +2586,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm11[1],ymm14[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2597,13 +2598,13 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] @@ -2720,409 +2721,412 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $1624, %rsp # imm = 0x658 -; AVX1-ONLY-NEXT: vzeroupper -; AVX1-ONLY-NEXT: retq -; -; AVX2-ONLY-LABEL: load_i64_stride6_vf32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1480, %rsp # imm = 0x5C8 -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1008(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm10[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm12[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%r8) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: addq $1624, %rsp # imm = 0x658 +; AVX1-ONLY-NEXT: vzeroupper +; AVX1-ONLY-NEXT: retq +; +; AVX2-ONLY-LABEL: load_i64_stride6_vf32: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: subq $1496, %rsp # imm = 0x5D8 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm0[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1008(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] +; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm2[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm12[0],xmm11[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm10[0],xmm13[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1200(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1200(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm2[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm14[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm6[1],mem[1] +; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm5[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 680(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 680(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm3[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1064(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; AVX2-ONLY-NEXT: vbroadcastsd 1064(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm13[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1448(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm4[1],mem[1] +; AVX2-ONLY-NEXT: vbroadcastsd 1448(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm2[1] +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm10[1] -; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm12[1],xmm11[1] +; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm12[1] -; AVX2-ONLY-NEXT: vbroadcastsd 872(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm13[1] +; AVX2-ONLY-NEXT: vbroadcastsd 872(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm14[1] ; AVX2-ONLY-NEXT: vbroadcastsd 1256(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm9[0] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm3[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm8[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm6[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 1120(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 976(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm6[0] +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 976(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm8[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 1504(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm2[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 1360(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm10[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm5[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm2[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm3[0],xmm0[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm0[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 928(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm5[0],ymm11[0],ymm5[2],ymm11[2] -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vbroadcastsd 928(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm11[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm12[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 1312(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm3[0],ymm14[0],ymm3[2],ymm14[2] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] ; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vmovaps 1168(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm15[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm15[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm11[1] -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm14 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm8[1] +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm13 @@ -3130,40 +3134,40 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm11[0] +; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 848(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] +; AVX2-ONLY-NEXT: vmovaps 848(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm10[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1040(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm7[0] +; AVX2-ONLY-NEXT: vmovaps 1040(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm8[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 1232(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm6[0] +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm6[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 1424(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm1 @@ -3171,486 +3175,490 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 136(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[1],mem[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm9[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 328(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[1],mem[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm9[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 520(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm13[1],xmm14[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm13[1],xmm14[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 712(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm10[1],xmm11[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm11[1],xmm12[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 904(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm10[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 1096(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm12[1],xmm7[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 1288(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 1480(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 1480(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r9) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm13, 64(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm15, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $1480, %rsp # imm = 0x5C8 +; AVX2-ONLY-NEXT: addq $1496, %rsp # imm = 0x5D8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride6_vf32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm18 ; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm25 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm14, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,7,13,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm14, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,4,10,4,10,4,10,4] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,5,11,5,11,5,11,5] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,0,0,6,12,0,0,6] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,1,7,13,0,1,7] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm21, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm9, %zmm1 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <1,7,13,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm14, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm16, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <10,0,6,u> -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <11,1,7,u> -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,10,4,10,4,10,4,10] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,11,5,11,5,11,5,11] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm2, %zmm21, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [10,4,10,4,10,4,10,4] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,5,11,5,11,5,11,5] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm16, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm9, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm13, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm21, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <10,0,6,u> +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm31 = <11,1,7,u> +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm5 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm16, %zmm14 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm9, %zmm16 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm13, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm30 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,6,12,0,0,6,12] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,7,13,0,1,7,13] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512F-NEXT: vpermi2q %zmm19, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm19, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm19, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm31, %zmm14 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm19, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm19, %zmm4, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] ; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm1, %zmm24 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,10,0,6,0,10,0,6] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm4, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm11 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm3, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 ; AVX512F-NEXT: vpermi2q %zmm6, %zmm21, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm22 -; AVX512F-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm3, %zmm16 +; AVX512F-NEXT: vpermi2q %zmm21, %zmm6, %zmm3 ; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm7 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm12 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm19 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm22 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm24 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm3, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm7, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm11, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm10, %zmm6, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512F-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm20, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm23, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm28, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm31, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm9, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm26, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm16, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm7, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm29, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm17, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm11, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm13, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm23, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -3658,347 +3666,351 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i64_stride6_vf32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm18 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm25 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm14, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,7,13,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm14, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,4,10,4,10,4,10,4] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,0,0,6,12,0,0,6] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,1,7,13,0,1,7] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm9, %zmm1 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <1,7,13,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm16, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <10,0,6,u> -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [10,4,10,4,10,4,10,4] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,5,11,5,11,5,11,5] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <11,1,7,u> -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,10,4,10,4,10,4,10] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,11,5,11,5,11,5,11] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm16, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm9, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm13, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <10,0,6,u> +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = <11,1,7,u> +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm8 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm9, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm13, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm30 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,6,12,0,0,6,12] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,7,13,0,1,7,13] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm19, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm31, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm19, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm24 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,10,0,6,0,10,0,6] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm21, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm16 +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm6, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm7 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm15, %zmm7, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm11, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm6, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm20, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm23, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm26, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm7, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -4586,7 +4598,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 1568(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1712(%rdi), %xmm0 @@ -4630,55 +4642,55 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 2288(%rdi), %xmm0 -; SSE-NEXT: movaps 2240(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 2240(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 2384(%rdi), %xmm0 -; SSE-NEXT: movaps 2336(%rdi), %xmm14 +; SSE-NEXT: movaps 2336(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 2480(%rdi), %xmm0 +; SSE-NEXT: movaps 2432(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 2480(%rdi), %xmm0 -; SSE-NEXT: movaps 2432(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 2576(%rdi), %xmm0 -; SSE-NEXT: movaps 2528(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 2528(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps 2672(%rdi), %xmm0 ; SSE-NEXT: movaps 2624(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 2768(%rdi), %xmm0 -; SSE-NEXT: movaps 2720(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movaps 2720(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] -; SSE-NEXT: movaps 2864(%rdi), %xmm0 -; SSE-NEXT: movaps 2816(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps 2960(%rdi), %xmm0 -; SSE-NEXT: movaps 2912(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps 2864(%rdi), %xmm0 +; SSE-NEXT: movaps 2816(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 2960(%rdi), %xmm0 +; SSE-NEXT: movaps 2912(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movaps 3056(%rdi), %xmm0 -; SSE-NEXT: movaps 3008(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 3008(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 496(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4935,13 +4947,14 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movaps %xmm1, 496(%r9) -; SSE-NEXT: movaps %xmm3, 480(%r9) -; SSE-NEXT: movaps %xmm6, 464(%r9) +; SSE-NEXT: movaps %xmm2, 496(%r9) +; SSE-NEXT: movaps %xmm4, 480(%r9) +; SSE-NEXT: movaps %xmm7, 464(%r9) ; SSE-NEXT: movaps %xmm8, 448(%r9) -; SSE-NEXT: movaps %xmm10, 432(%r9) -; SSE-NEXT: movaps %xmm12, 416(%r9) -; SSE-NEXT: movaps %xmm15, 400(%r9) +; SSE-NEXT: movaps %xmm11, 432(%r9) +; SSE-NEXT: movaps %xmm13, 416(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4958,7 +4971,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 288(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%r9) @@ -4993,17 +5006,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 496(%rax) -; SSE-NEXT: movaps %xmm4, 480(%rax) -; SSE-NEXT: movaps %xmm5, 464(%rax) -; SSE-NEXT: movaps %xmm7, 448(%rax) +; SSE-NEXT: movaps %xmm1, 496(%rax) +; SSE-NEXT: movaps %xmm3, 480(%rax) +; SSE-NEXT: movaps %xmm6, 464(%rax) +; SSE-NEXT: movaps %xmm5, 448(%rax) ; SSE-NEXT: movaps %xmm9, 432(%rax) -; SSE-NEXT: movaps %xmm11, 416(%rax) -; SSE-NEXT: movaps %xmm13, 400(%rax) -; SSE-NEXT: movaps %xmm14, 384(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 368(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, 416(%rax) +; SSE-NEXT: movaps %xmm14, 400(%rax) +; SSE-NEXT: movaps %xmm12, 384(%rax) +; SSE-NEXT: movaps %xmm15, 368(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 336(%rax) @@ -5511,11 +5523,11 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm15[0],ymm6[2],ymm15[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm15[0],ymm6[2],ymm15[2] ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm4[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm14 @@ -5639,13 +5651,13 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] @@ -5919,297 +5931,297 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 416(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 352(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 288(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 416(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 352(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 288(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 416(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 352(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 288(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 416(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 352(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 288(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 416(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 352(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 288(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 416(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 384(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 352(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $3768, %rsp # imm = 0xEB8 -; AVX1-ONLY-NEXT: vzeroupper -; AVX1-ONLY-NEXT: retq -; -; AVX2-ONLY-LABEL: load_i64_stride6_vf64: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3416, %rsp # imm = 0xD58 -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1008(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1776(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2208(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2160(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm4, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 352(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: addq $3768, %rsp # imm = 0xEB8 +; AVX1-ONLY-NEXT: vzeroupper +; AVX1-ONLY-NEXT: retq +; +; AVX2-ONLY-LABEL: load_i64_stride6_vf64: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: subq $3432, %rsp # imm = 0xD68 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2592(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1008(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2544(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 1776(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2208(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 2160(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2592(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 2544(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %ymm0 @@ -6218,11 +6230,11 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2928(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 2928(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 @@ -6231,11 +6243,11 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 @@ -6268,9 +6280,10 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1200(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 1200(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm14[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm0 @@ -6279,16 +6292,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1584(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 1584(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm12[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm14[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 1968(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %xmm5 @@ -6403,13 +6416,14 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm14[1] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm9[1],mem[1] ; AVX2-ONLY-NEXT: vbroadcastsd 1256(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm14[1] ; AVX2-ONLY-NEXT: vbroadcastsd 1640(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] @@ -6417,8 +6431,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm10[1] ; AVX2-ONLY-NEXT: vbroadcastsd 2024(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm6[1] @@ -6517,7 +6530,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 2752(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 2704(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6534,11 +6547,11 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vbroadcastsd 2080(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1936(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 1936(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 1696(%rdi), %ymm0 @@ -6562,63 +6575,66 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm5[0],xmm6[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm5[0],xmm6[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm3[0],xmm15[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm3[0],xmm13[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm14[0],xmm0[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm15[0],xmm0[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm13[1],ymm2[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm14[1],ymm1[3],ymm14[3] -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm15 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm15[1] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm13[1] +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm6[1] ; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm15 +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6626,17 +6642,19 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm2[1],ymm10[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6646,7 +6664,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm10 +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm8 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6666,7 +6684,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm8 +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm10 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6686,10 +6704,9 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] ; AVX2-ONLY-NEXT: vmovaps 2272(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6713,7 +6730,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] ; AVX2-ONLY-NEXT: vmovaps 2848(%rdi), %ymm2 @@ -6734,140 +6751,144 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] +; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps %xmm12, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 848(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 848(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1040(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 1040(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1232(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 1232(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1424(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 1424(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1616(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 1616(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1808(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 1808(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2000(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 2000(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2192(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 2192(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 2144(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2384(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 2384(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vmovaps 2336(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm8[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm12[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2576(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 2576(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vmovaps 2528(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm8[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 2768(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps 2720(%rdi), %xmm3 @@ -6894,7 +6915,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6956,38 +6977,38 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vbroadcastsd 1864(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm13[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 2056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = xmm11[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 2248(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm9[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 2440(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 2632(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 2824(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 3016(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 2632(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 2824(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 3016(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7149,10 +7170,10 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm2, 480(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 448(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 448(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 416(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 352(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm13, 288(%rax) @@ -7173,596 +7194,599 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $3416, %rsp # imm = 0xD58 +; AVX2-ONLY-NEXT: addq $3432, %rsp # imm = 0xD68 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride6_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $7240, %rsp # imm = 0x1C48 -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,0,10,0,6,0,10] +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm9 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,7,0,11,1,7,0,11] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [10,4,10,4,10,4,10,4] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,5,11,5,11,5,11,5] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm12, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,0,0,6,12,0,0,6] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,0,1,7,13,0,1,7] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm1 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <0,6,12,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <1,7,13,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <10,0,6,u> -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <11,1,7,u> -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,11,5,11,5,11,5,11] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,u> ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <1,7,13,u> ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <10,0,6,u> +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <11,1,7,u> +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,10,4,10,4,10,4,10] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm29 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm23 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm13, %zmm28 -; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm17 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm17, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm14, %zmm30 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm17, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm15, %zmm31 -; AVX512F-NEXT: vpermi2q %zmm17, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm17, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm17, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm29 +; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm12 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm12, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm2, %zmm12, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm12, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm12, %zmm2, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm2, %zmm12, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] ; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] ; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512F-NEXT: vpermi2q %zmm18, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm2, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm18, %zmm9, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm3, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm9, %zmm18, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm2, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm14, %zmm31, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm14, %zmm31, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 +; AVX512F-NEXT: vpermi2q %zmm31, %zmm14, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} @@ -7788,146 +7812,144 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm31 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm29, %zmm19, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm17, %zmm22, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm20, %zmm18, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm13, 448(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm28, 384(%rsi) -; AVX512F-NEXT: vmovups (%rsp), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 320(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 256(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 192(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 128(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm14, 448(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 256(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 320(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 128(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 192(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, (%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm30, 384(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm15, 448(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 256(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 320(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 128(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 192(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, (%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm31, 384(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm16, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm25, 256(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 320(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 128(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 192(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, (%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 64(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm8, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, 448(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm29, 384(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 320(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 256(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 192(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 128(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm9, 448(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 256(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 320(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 128(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 192(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, (%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 64(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 384(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm10, 448(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 256(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 320(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 128(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 192(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, (%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 64(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 384(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm11, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm31, 320(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 128(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 192(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, (%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 64(%r8) +; AVX512F-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm15, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm13, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm0, 384(%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm18, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 448(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512F-NEXT: addq $7240, %rsp # imm = 0x1C48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -7935,589 +7957,592 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i64_stride6_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $7240, %rsp # imm = 0x1C48 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,0,10,0,6,0,10] +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,7,0,11,1,7,0,11] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [10,4,10,4,10,4,10,4] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,0,0,6,12,0,0,6] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,0,1,7,13,0,1,7] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <0,6,12,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <1,7,13,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <10,0,6,u> -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <11,1,7,u> -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,11,5,11,5,11,5,11] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,u> +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <1,7,13,u> +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <10,0,6,u> +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <11,1,7,u> +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,10,4,10,4,10,4,10] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm13, %zmm28 -; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm17 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm17, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm14, %zmm30 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm17, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm15, %zmm31 -; AVX512BW-NEXT: vpermi2q %zmm17, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm17, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm29 +; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm17, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm9, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} @@ -8543,146 +8568,144 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm31 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm29, %zmm19, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm22, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm20, %zmm18, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm13, 448(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 384(%rsi) -; AVX512BW-NEXT: vmovups (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 320(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 256(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 128(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 256(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 320(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 192(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, (%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 448(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 256(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 320(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 128(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 384(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 256(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 320(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 128(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 192(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, (%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 64(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 384(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 320(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 256(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 192(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 128(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 256(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 320(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 192(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, (%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 64(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 256(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 320(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 128(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, (%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 64(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 320(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 128(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 192(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, (%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 64(%r8) +; AVX512BW-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 384(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm18, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 448(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512BW-NEXT: addq $7240, %rsp # imm = 0x1C48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll index 70b718cd7f282..9841686a648a5 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll @@ -531,16 +531,16 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i64_stride7_vf8: ; SSE: # %bb.0: ; SSE-NEXT: subq $88, %rsp -; SSE-NEXT: movapd 320(%rdi), %xmm0 -; SSE-NEXT: movapd 208(%rdi), %xmm1 -; SSE-NEXT: movapd 256(%rdi), %xmm2 -; SSE-NEXT: movapd 144(%rdi), %xmm3 -; SSE-NEXT: movapd 304(%rdi), %xmm4 -; SSE-NEXT: movapd 192(%rdi), %xmm5 -; SSE-NEXT: movapd 240(%rdi), %xmm6 -; SSE-NEXT: movapd 128(%rdi), %xmm7 -; SSE-NEXT: movapd 288(%rdi), %xmm8 -; SSE-NEXT: movapd 176(%rdi), %xmm9 +; SSE-NEXT: movapd 320(%rdi), %xmm1 +; SSE-NEXT: movapd 208(%rdi), %xmm0 +; SSE-NEXT: movapd 256(%rdi), %xmm3 +; SSE-NEXT: movapd 144(%rdi), %xmm2 +; SSE-NEXT: movapd 304(%rdi), %xmm5 +; SSE-NEXT: movapd 192(%rdi), %xmm4 +; SSE-NEXT: movapd 240(%rdi), %xmm7 +; SSE-NEXT: movapd 128(%rdi), %xmm6 +; SSE-NEXT: movapd 288(%rdi), %xmm9 +; SSE-NEXT: movapd 176(%rdi), %xmm8 ; SSE-NEXT: movapd 336(%rdi), %xmm10 ; SSE-NEXT: movapd 224(%rdi), %xmm11 ; SSE-NEXT: movapd 272(%rdi), %xmm14 @@ -549,21 +549,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movapd %xmm15, %xmm12 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm13[0],xmm12[1] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm9[0] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm5[0] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm1[0] -; SSE-NEXT: movapd %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm14, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm11[0],xmm15[1] -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm8[0] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm8[0] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm4[0] @@ -571,49 +557,63 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 384(%rdi), %xmm5 -; SSE-NEXT: movapd %xmm5, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm10[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, %xmm12 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm11[0],xmm12[1] +; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm9[0] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm5[0] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm1[0] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 384(%rdi), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm11 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm10[0],xmm11[1] ; SSE-NEXT: movapd 400(%rdi), %xmm7 ; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm7[0] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 352(%rdi), %xmm8 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm8[0],xmm7[1] -; SSE-NEXT: movapd 416(%rdi), %xmm9 -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm9[0] -; SSE-NEXT: movapd 368(%rdi), %xmm10 -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm10[0],xmm9[1] -; SSE-NEXT: movapd 432(%rdi), %xmm11 -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm11[0] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm5[0],xmm11[1] -; SSE-NEXT: movapd (%rdi), %xmm5 -; SSE-NEXT: movapd 48(%rdi), %xmm12 -; SSE-NEXT: movapd %xmm12, %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] +; SSE-NEXT: movapd 416(%rdi), %xmm10 +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm10[0] +; SSE-NEXT: movapd 368(%rdi), %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm15[0],xmm10[1] +; SSE-NEXT: movapd 432(%rdi), %xmm14 +; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm14[0] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: movapd (%rdi), %xmm2 +; SSE-NEXT: movapd 48(%rdi), %xmm9 +; SSE-NEXT: movapd %xmm9, %xmm3 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] ; SSE-NEXT: movapd 64(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] ; SSE-NEXT: movapd 16(%rdi), %xmm1 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd 80(%rdi), %xmm2 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0] -; SSE-NEXT: movapd 32(%rdi), %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; SSE-NEXT: movapd 96(%rdi), %xmm4 -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm4[0] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm12[0],xmm4[1] -; SSE-NEXT: movapd %xmm6, (%rsi) -; SSE-NEXT: movapd %xmm14, 48(%rsi) -; SSE-NEXT: movapd %xmm15, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 16(%rsi) -; SSE-NEXT: movapd %xmm5, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 32(%rdx) +; SSE-NEXT: movapd 80(%rdi), %xmm4 +; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm4[0] +; SSE-NEXT: movapd 32(%rdi), %xmm5 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE-NEXT: movapd 96(%rdi), %xmm6 +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm6[0] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] +; SSE-NEXT: movapd %xmm3, (%rsi) +; SSE-NEXT: movapd %xmm11, 48(%rsi) +; SSE-NEXT: movapd %xmm12, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rsi) +; SSE-NEXT: movapd %xmm2, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rdx) ; SSE-NEXT: movapd %xmm13, 16(%rdx) ; SSE-NEXT: movapd %xmm0, (%rcx) ; SSE-NEXT: movapd %xmm7, 48(%rcx) @@ -627,22 +627,22 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movapd %xmm2, (%r9) -; SSE-NEXT: movapd %xmm9, 48(%r9) +; SSE-NEXT: movapd %xmm4, (%r9) +; SSE-NEXT: movapd %xmm10, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm3, (%rax) -; SSE-NEXT: movapd %xmm10, 48(%rax) +; SSE-NEXT: movapd %xmm5, (%rax) +; SSE-NEXT: movapd %xmm15, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm4, (%rax) -; SSE-NEXT: movapd %xmm11, 48(%rax) +; SSE-NEXT: movapd %xmm6, (%rax) +; SSE-NEXT: movapd %xmm14, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -652,7 +652,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX1-ONLY-LABEL: load_i64_stride7_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm6 @@ -663,7 +663,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -677,7 +677,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[3],ymm8[2] ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm10[0],ymm7[3],ymm10[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[3],ymm9[2] ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3] @@ -688,11 +688,11 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3] ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm14[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3] ; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm15 = xmm14[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm11[1],ymm9[0],ymm11[2],ymm9[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm15[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm11[1],ymm10[0],ymm11[2],ymm10[2] ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 @@ -706,8 +706,8 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = mem[0],xmm9[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = mem[0],xmm10[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm14 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] @@ -738,12 +738,12 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 32(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm8, (%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm11, (%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm2, 32(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm9, (%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm10, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovapd %ymm1, 32(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm5, (%rax) @@ -756,9 +756,9 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-LABEL: load_i64_stride7_vf8: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm7 @@ -783,7 +783,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] @@ -792,20 +792,20 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = mem[0,1],xmm14[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq 352(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] ; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 @@ -814,8 +814,8 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] @@ -824,17 +824,17 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm15[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -842,129 +842,129 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa %ymm8, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 32(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, (%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride7_vf8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm7 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,u> -; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <0,7,14,u> +; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512F-NEXT: movb $24, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm7[4,5,4,5],zmm6[4,5,4,5] +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm7[4,5,4,5],zmm6[4,5,4,5] ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm8 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,9,0,5,6,9] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 ; AVX512F-NEXT: movb $-32, %al -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 {%k2} +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,10,0,5,6,10] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,6,13,6,13,6,13,6] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm8 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,10,0,5,6,10] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm10 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [14,0,0,7,14,0,0,7] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm11 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $0, %xmm11, %zmm10, %zmm10 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [7,0,9,0,7,0,9,0] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm13 -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm15 = [4,11] -; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm15 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,11,4,11] -; AVX512F-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm7 +; AVX512F-NEXT: vinserti32x4 $0, %xmm11, %zmm8, %zmm8 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,0,9,0,7,0,9,0] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm15 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm13 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm14 = [4,11] +; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,11,4,11] +; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm7 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <9,0,7,u> -; AVX512F-NEXT: vpermi2q %zmm5, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 {%k2} +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm6 {%k1} ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [12,5,12,5,12,5,12,5] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,11,0,5,6,11] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm7, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,11,0,5,6,11] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm7, %zmm10 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k2} -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,12,0,5,6,12] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 -; AVX512F-NEXT: vinserti32x4 $0, %xmm14, %zmm13, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} +; AVX512F-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm11, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,7,14,0,0,7,14,0] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm7, %zmm9 -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm9, %zmm12 -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] -; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,13,4,5,6,13] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm7, %zmm10 +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm7 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,0,7,0,9,0,7,0] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,14,4,5,6,14] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm10 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = [5,12] +; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,9,2,9,2,9,2,9] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] -; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm3, (%rsi) +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm11, %zmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] +; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm8, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm9, (%r8) ; AVX512F-NEXT: vmovdqa64 %zmm7, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm9, (%r10) -; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, (%r10) +; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -972,110 +972,110 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,u> -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,7,14,u> +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: movb $24, %r11b -; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm7[4,5,4,5],zmm6[4,5,4,5] +; AVX512BW-NEXT: kmovd %r11d, %k2 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm7[4,5,4,5],zmm6[4,5,4,5] ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,9,0,5,6,9] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 ; AVX512BW-NEXT: movb $-32, %r11b -; AVX512BW-NEXT: kmovd %r11d, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k2} +; AVX512BW-NEXT: kmovd %r11d, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,10,0,5,6,10] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,10,0,5,6,10] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm10 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [14,0,0,7,14,0,0,7] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm10, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [7,0,9,0,7,0,9,0] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm13 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm8, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,0,9,0,7,0,9,0] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm13 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5],ymm13[6,7] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [4,11] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,11,4,11] ; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm7 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <9,0,7,u> -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 {%k2} +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [12,5,12,5,12,5,12,5] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,11,0,5,6,11] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k2} -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,11,0,5,6,11] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,12,0,5,6,12] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm13, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm9, %zmm11, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,7,14,0,0,7,14,0] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm9 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm12 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,13,4,5,6,13] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm7, %zmm10 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,0,7,0,9,0,7,0] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,14,4,5,6,14] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm10 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [5,12] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,9,2,9,2,9,2,9] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rsi) +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm11, %zmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r10) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <56 x i64>, ptr %in.vec, align 64 @@ -1100,58 +1100,58 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i64_stride7_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $536, %rsp # imm = 0x218 -; SSE-NEXT: movapd 208(%rdi), %xmm0 -; SSE-NEXT: movapd 96(%rdi), %xmm1 -; SSE-NEXT: movapd 144(%rdi), %xmm2 -; SSE-NEXT: movapd 192(%rdi), %xmm3 -; SSE-NEXT: movapd 80(%rdi), %xmm4 -; SSE-NEXT: movapd 128(%rdi), %xmm5 -; SSE-NEXT: movapd 64(%rdi), %xmm8 -; SSE-NEXT: movapd 176(%rdi), %xmm9 -; SSE-NEXT: movapd (%rdi), %xmm10 -; SSE-NEXT: movapd 16(%rdi), %xmm7 -; SSE-NEXT: movapd 32(%rdi), %xmm6 -; SSE-NEXT: movapd 48(%rdi), %xmm14 -; SSE-NEXT: movapd 224(%rdi), %xmm11 -; SSE-NEXT: movapd 112(%rdi), %xmm12 -; SSE-NEXT: movapd 160(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm14, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm8[0] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm7[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm4[0] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm15, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm9[0] +; SSE-NEXT: movapd 208(%rdi), %xmm3 +; SSE-NEXT: movapd 96(%rdi), %xmm2 +; SSE-NEXT: movapd 144(%rdi), %xmm4 +; SSE-NEXT: movapd 192(%rdi), %xmm6 +; SSE-NEXT: movapd 80(%rdi), %xmm5 +; SSE-NEXT: movapd 128(%rdi), %xmm8 +; SSE-NEXT: movapd 64(%rdi), %xmm10 +; SSE-NEXT: movapd 176(%rdi), %xmm11 +; SSE-NEXT: movapd (%rdi), %xmm12 +; SSE-NEXT: movapd 16(%rdi), %xmm9 +; SSE-NEXT: movapd 32(%rdi), %xmm7 +; SSE-NEXT: movapd 48(%rdi), %xmm0 +; SSE-NEXT: movapd 224(%rdi), %xmm13 +; SSE-NEXT: movapd 112(%rdi), %xmm14 +; SSE-NEXT: movapd 160(%rdi), %xmm1 +; SSE-NEXT: movapd %xmm0, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm10[0] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm5[0] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm2[0] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm11[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm6[0] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm3[0] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 272(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 288(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm1[0] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm1[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1170,7 +1170,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd 384(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 400(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1211,85 +1211,85 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 560(%rdi), %xmm13 -; SSE-NEXT: movapd 608(%rdi), %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 608(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 624(%rdi), %xmm14 ; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm14[0] -; SSE-NEXT: movapd 576(%rdi), %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] -; SSE-NEXT: movapd 640(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; SSE-NEXT: movapd 576(%rdi), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: movapd 640(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 592(%rdi), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 592(%rdi), %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 656(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; SSE-NEXT: movapd 656(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 672(%rdi), %xmm2 -; SSE-NEXT: movapd 720(%rdi), %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm2[0],xmm8[1] -; SSE-NEXT: movapd 736(%rdi), %xmm6 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm6[0] -; SSE-NEXT: movapd 688(%rdi), %xmm9 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] +; SSE-NEXT: movapd 672(%rdi), %xmm6 +; SSE-NEXT: movapd 720(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] +; SSE-NEXT: movapd 736(%rdi), %xmm8 +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm8[0] +; SSE-NEXT: movapd 688(%rdi), %xmm10 +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm10[0],xmm8[1] ; SSE-NEXT: movapd 752(%rdi), %xmm12 -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm12[0] +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm12[0] ; SSE-NEXT: movapd 704(%rdi), %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm15[0],xmm12[1] -; SSE-NEXT: movapd 768(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm0[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 784(%rdi), %xmm4 -; SSE-NEXT: movapd 832(%rdi), %xmm11 -; SSE-NEXT: movapd %xmm11, %xmm5 -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] -; SSE-NEXT: movapd 848(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm0[0] -; SSE-NEXT: movapd 800(%rdi), %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd 864(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm3[0] -; SSE-NEXT: movapd 816(%rdi), %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] -; SSE-NEXT: movapd 880(%rdi), %xmm10 -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm10[0] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm11[0],xmm10[1] -; SSE-NEXT: movapd %xmm8, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 32(%rsi) -; SSE-NEXT: movapd %xmm5, 112(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 16(%rsi) -; SSE-NEXT: movapd %xmm2, 96(%rdx) +; SSE-NEXT: movapd 768(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 784(%rdi), %xmm0 +; SSE-NEXT: movapd 832(%rdi), %xmm4 +; SSE-NEXT: movapd %xmm4, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd 848(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE-NEXT: movapd 800(%rdi), %xmm3 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movapd 864(%rdi), %xmm5 +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm5[0] +; SSE-NEXT: movapd 816(%rdi), %xmm9 +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm9[0],xmm5[1] +; SSE-NEXT: movapd 880(%rdi), %xmm11 +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm11[0] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm4[0],xmm11[1] +; SSE-NEXT: movapd %xmm7, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 32(%rsi) +; SSE-NEXT: movapd %xmm2, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movapd %xmm4, 112(%rdx) +; SSE-NEXT: movaps %xmm2, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) +; SSE-NEXT: movaps %xmm2, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rdx) +; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movapd %xmm13, 80(%rdx) +; SSE-NEXT: movaps %xmm2, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movapd %xmm6, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movapd %xmm6, 96(%rcx) -; SSE-NEXT: movapd %xmm0, 112(%rcx) +; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movapd %xmm0, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movapd %xmm13, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movapd %xmm8, 96(%rcx) +; SSE-NEXT: movapd %xmm1, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movapd %xmm14, 80(%rcx) @@ -1301,8 +1301,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movapd %xmm1, 112(%r8) -; SSE-NEXT: movapd %xmm9, 96(%r8) +; SSE-NEXT: movapd %xmm3, 112(%r8) +; SSE-NEXT: movapd %xmm10, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1315,7 +1315,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movapd %xmm3, 112(%r9) +; SSE-NEXT: movapd %xmm5, 112(%r9) ; SSE-NEXT: movapd %xmm12, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r9) @@ -1330,9 +1330,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm7, 112(%rax) +; SSE-NEXT: movapd %xmm9, 112(%rax) ; SSE-NEXT: movapd %xmm15, 96(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) @@ -1345,7 +1345,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm10, 112(%rax) +; SSE-NEXT: movapd %xmm11, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1423,182 +1423,182 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm7[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[3],ymm8[2] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm3[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] ; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm3[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm3[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm5[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm5[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm5[1],ymm15[0],ymm5[2],ymm15[2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm5[1],ymm12[0],ymm5[2],ymm12[2] ; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm9[1],ymm11[0],ymm9[2],ymm11[2] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[1],ymm11[0],ymm10[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm4[0],ymm6[2],ymm4[2] ; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm6[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm6[0],xmm7[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm7[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm2[0],xmm15[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm7[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm6[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm8[0],xmm7[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm7[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm5[0],xmm12[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm7[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm15[0],xmm4[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm11[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm3[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm3[0],ymm10[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm1[0],ymm5[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = mem[0],xmm11[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[0],ymm2[0],ymm6[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2] ; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm2[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm15[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm15[0],ymm7[3],ymm15[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm3[0],ymm7[3],ymm3[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm1[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0],ymm1[0],ymm10[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%r8) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%r8) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm11, (%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 64(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 96(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 32(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 64(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 96(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 32(%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm4, (%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 64(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm10, (%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 64(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 32(%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm7, 64(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 64(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm1, (%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 32(%rax) ; AVX1-ONLY-NEXT: addq $552, %rsp # imm = 0x228 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride7_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX2-ONLY-NEXT: subq $520, %rsp # imm = 0x208 ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm2 @@ -1626,9 +1626,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm11[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm4 @@ -1703,8 +1702,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1713,7 +1712,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm0[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] @@ -1726,58 +1726,57 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm4[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm14[1],ymm2[1],ymm14[3],ymm2[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm4[1],ymm15[3],ymm4[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -1802,7 +1801,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm15, 64(%r8) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -1810,21 +1810,21 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 64(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, 96(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm11, (%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm7, 96(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, (%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm13, 96(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 32(%rax) -; AVX2-ONLY-NEXT: addq $504, %rsp # imm = 0x1F8 +; AVX2-ONLY-NEXT: vmovdqa %ymm15, 32(%rax) +; AVX2-ONLY-NEXT: addq $520, %rsp # imm = 0x208 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -2221,58 +2221,58 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i64_stride7_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $1448, %rsp # imm = 0x5A8 -; SSE-NEXT: movapd 208(%rdi), %xmm0 -; SSE-NEXT: movapd 96(%rdi), %xmm1 -; SSE-NEXT: movapd 144(%rdi), %xmm2 -; SSE-NEXT: movapd 192(%rdi), %xmm3 -; SSE-NEXT: movapd 80(%rdi), %xmm4 -; SSE-NEXT: movapd 128(%rdi), %xmm5 -; SSE-NEXT: movapd 64(%rdi), %xmm8 -; SSE-NEXT: movapd 176(%rdi), %xmm9 -; SSE-NEXT: movapd (%rdi), %xmm10 -; SSE-NEXT: movapd 16(%rdi), %xmm7 -; SSE-NEXT: movapd 32(%rdi), %xmm6 -; SSE-NEXT: movapd 48(%rdi), %xmm14 -; SSE-NEXT: movapd 224(%rdi), %xmm11 -; SSE-NEXT: movapd 112(%rdi), %xmm12 -; SSE-NEXT: movapd 160(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm14, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm8[0] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm7[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm4[0] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm15, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm9[0] +; SSE-NEXT: movapd 208(%rdi), %xmm3 +; SSE-NEXT: movapd 96(%rdi), %xmm2 +; SSE-NEXT: movapd 144(%rdi), %xmm4 +; SSE-NEXT: movapd 192(%rdi), %xmm6 +; SSE-NEXT: movapd 80(%rdi), %xmm5 +; SSE-NEXT: movapd 128(%rdi), %xmm8 +; SSE-NEXT: movapd 64(%rdi), %xmm10 +; SSE-NEXT: movapd 176(%rdi), %xmm11 +; SSE-NEXT: movapd (%rdi), %xmm12 +; SSE-NEXT: movapd 16(%rdi), %xmm9 +; SSE-NEXT: movapd 32(%rdi), %xmm7 +; SSE-NEXT: movapd 48(%rdi), %xmm0 +; SSE-NEXT: movapd 224(%rdi), %xmm13 +; SSE-NEXT: movapd 112(%rdi), %xmm14 +; SSE-NEXT: movapd 160(%rdi), %xmm1 +; SSE-NEXT: movapd %xmm0, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm10[0] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm5[0] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm2[0] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm11[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm6[0] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm3[0] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 272(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 288(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm1[0] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm1[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2485,13 +2485,13 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1344(%rdi), %xmm13 +; SSE-NEXT: movapd 1344(%rdi), %xmm14 ; SSE-NEXT: movapd 1392(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1408(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm1[0] ; SSE-NEXT: movapd 1360(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2507,30 +2507,30 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1456(%rdi), %xmm9 -; SSE-NEXT: movapd 1504(%rdi), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm9[0],xmm14[1] -; SSE-NEXT: movapd 1520(%rdi), %xmm12 -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm12[0] +; SSE-NEXT: movapd 1504(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm12 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm9[0],xmm12[1] +; SSE-NEXT: movapd 1520(%rdi), %xmm13 +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm13[0] ; SSE-NEXT: movapd 1472(%rdi), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm2[0],xmm12[1] -; SSE-NEXT: movapd 1536(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] +; SSE-NEXT: movapd 1536(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1488(%rdi), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1552(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 1552(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1568(%rdi), %xmm5 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 1568(%rdi), %xmm6 ; SSE-NEXT: movapd 1616(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1] +; SSE-NEXT: movapd %xmm0, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] ; SSE-NEXT: movapd 1632(%rdi), %xmm8 -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm8[0] +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm8[0] ; SSE-NEXT: movapd 1584(%rdi), %xmm11 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm11[0],xmm8[1] ; SSE-NEXT: movapd 1648(%rdi), %xmm1 @@ -2543,54 +2543,54 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1680(%rdi), %xmm1 -; SSE-NEXT: movapd 1728(%rdi), %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd 1744(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm3[0] -; SSE-NEXT: movapd 1696(%rdi), %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1] +; SSE-NEXT: movapd 1680(%rdi), %xmm3 +; SSE-NEXT: movapd 1728(%rdi), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movapd 1744(%rdi), %xmm5 +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm5[0] +; SSE-NEXT: movapd 1696(%rdi), %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] ; SSE-NEXT: movapd 1760(%rdi), %xmm10 -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm10[0] +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm10[0] ; SSE-NEXT: movapd 1712(%rdi), %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm15[0],xmm10[1] ; SSE-NEXT: movapd 1776(%rdi), %xmm0 ; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm0[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm7, 224(%rsi) +; SSE-NEXT: movapd %xmm4, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rsi) -; SSE-NEXT: movapd %xmm2, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 112(%rsi) +; SSE-NEXT: movaps %xmm2, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movapd %xmm1, 240(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movapd %xmm14, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movapd %xmm5, 224(%rdx) -; SSE-NEXT: movapd %xmm1, 240(%rdx) -; SSE-NEXT: movapd %xmm13, 192(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movapd %xmm12, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movapd %xmm6, 224(%rdx) +; SSE-NEXT: movapd %xmm3, 240(%rdx) +; SSE-NEXT: movapd %xmm14, 192(%rdx) ; SSE-NEXT: movapd %xmm9, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rdx) @@ -2616,9 +2616,9 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm3, 240(%rcx) +; SSE-NEXT: movapd %xmm5, 240(%rcx) ; SSE-NEXT: movapd %xmm8, 224(%rcx) -; SSE-NEXT: movapd %xmm12, 208(%rcx) +; SSE-NEXT: movapd %xmm13, 208(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2645,7 +2645,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movapd %xmm6, 240(%r8) +; SSE-NEXT: movapd %xmm7, 240(%r8) ; SSE-NEXT: movapd %xmm11, 224(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r8) @@ -2776,4519 +2776,4577 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1720, %rsp # imm = 0x6B8 +; AVX1-ONLY-NEXT: subq $1736, %rsp # imm = 0x6C8 ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm11[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm10[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovapd 944(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm2[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm12[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1440(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 1440(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1344(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1344(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovapd 1392(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm7[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm3[0],xmm0[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm10[0],ymm0[0],ymm10[3],ymm0[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm9[0],ymm3[3],ymm9[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm0[0],ymm7[3],ymm0[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm6[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[3],ymm7[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[3],ymm1[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm14[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm13[0],ymm5[3],ymm13[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm8[0],ymm3[0],ymm8[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm15[0],ymm10[0],ymm15[3],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm11[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[3],ymm6[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm14[0],ymm11[0],ymm14[3],ymm11[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[3],ymm8[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm9[0],ymm14[0],ymm9[3],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1056(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm12[0],ymm5[0],ymm12[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovapd 1056(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm15[0],ymm13[0],ymm15[3],ymm13[2] ; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[3],ymm8[2] -; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm12[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm12[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm2[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm6[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = xmm11[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vmovdqa 912(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovdqa 1360(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1,2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm9[0],ymm2[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm10[1],ymm5[0],ymm10[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm11[0],mem[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1248(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 1200(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm13[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm0[0],ymm6[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm0[0],ymm11[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovdqa 912(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovdqa 1360(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm12[1],ymm8[0],ymm12[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm11[1],ymm5[0],ymm11[2],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1200(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm7[1],ymm1[0],ymm7[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm9[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm13[1],ymm1[0],ymm13[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[1],ymm9[0],ymm2[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm15[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm0[0],ymm15[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm0[1],ymm3[0],ymm0[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm15[0],ymm1[2],ymm15[2] +; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm10[1],ymm1[0],ymm10[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[1],ymm0[0],ymm8[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm2[0],ymm6[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm2[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm12[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm2[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm12[0],xmm8[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm13[0],xmm5[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm11[0],xmm3[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm8[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm12[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1376(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm0[0],xmm9[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 1600(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm10[0],ymm9[0],ymm10[3],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm2[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm3[0],xmm9[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm14[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm9[0],xmm15[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm15[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm8[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 1600(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm13[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[3],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[3],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd %ymm2, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm14[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm2[0],ymm4[0],ymm2[3],ymm4[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[3],ymm14[2] -; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm15[0],ymm2[0],ymm15[3],ymm2[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm6[0],ymm15[0],ymm6[3],ymm15[2] -; AVX1-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm8[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm12[0],ymm1[0],ymm12[3],ymm1[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm0[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm12[0],ymm5[3],ymm12[2] +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[3],ymm7[2] +; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[3],ymm3[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm9[0],ymm1[3],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm15[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[3],ymm8[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[0],ymm12[3],ymm1[2] ; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm11 = xmm11[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm13[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm13[0,1],ymm11[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm10 = mem[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm3[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm15[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm6[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm5[3] ; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = xmm6[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm2[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r9) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm5, 224(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 192(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm0, 224(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 192(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 160(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 224(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 192(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 160(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm3, 128(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 64(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm9, (%rax) -; AVX1-ONLY-NEXT: addq $1720, %rsp # imm = 0x6B8 +; AVX1-ONLY-NEXT: vmovapd %ymm5, 96(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 64(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm14, (%rax) +; AVX1-ONLY-NEXT: addq $1736, %rsp # imm = 0x6C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride7_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1624, %rsp # imm = 0x658 +; AVX2-ONLY-NEXT: subq $1576, %rsp # imm = 0x628 ; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm4[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm8[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = mem[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm10[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovdqa 944(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovdqa 944(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm12[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovdqa 1392(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm2[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm10[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm7[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm8[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm9[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm11[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm13[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm11 = ymm14[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm12[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 352(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm12[1],ymm4[1],ymm12[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 800(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 1248(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 352(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpbroadcastq 1696(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] +; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 128(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 800(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpbroadcastq 576(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1248(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1696(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpbroadcastq 1024(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm10[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 128(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpbroadcastq 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm11[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 576(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1024(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1472(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm1[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm10[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm14[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm9[1],ymm2[3],ymm9[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm12[1],ymm1[3],ymm12[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r9) +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm13[1],ymm9[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm5[1],ymm12[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = mem[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 224(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 192(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 224(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm15, 192(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, 160(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 224(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 160(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 128(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 96(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 64(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm15, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $1624, %rsp # imm = 0x658 +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 224(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 160(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 128(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 96(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rax) +; AVX2-ONLY-NEXT: addq $1576, %rsp # imm = 0x628 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i64_stride7_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovaps 1024(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: subq $2216, %rsp # imm = 0x8A8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovaps 1024(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovaps 576(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm20, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] +; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm28, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] +; AVX512F-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] ; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm26, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm20, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm22, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm20, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm17, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm20, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm21, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm27 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm20 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: movb $24, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[4,5,4,5],zmm12[4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512F-ONLY-SLOW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm11[4,5,4,5],zmm10[4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] +; AVX512F-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm16, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,11,4,11] +; AVX512F-ONLY-SLOW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm29[4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[4,5,4,5],zmm13[4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm13, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[4,5,4,5],zmm9[4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm21, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm12[4,5,4,5],zmm24[4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm16, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm20, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm21, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm3[4,5,4,5],zmm15[4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm15, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm2[4,5,4,5],zmm9[4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm10, %zmm20 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm26, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [6,13] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: movb $-32, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, (%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm25, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm16, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm16, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 192(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 192(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 128(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm9, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm15, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $2216, %rsp # imm = 0x8A8 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i64_stride7_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovaps 1024(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512F-ONLY-FAST-NEXT: subq $2216, %rsp # imm = 0x8A8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovaps 1024(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovaps 576(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] +; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] +; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] +; AVX512F-ONLY-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] ; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512F-ONLY-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm26, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm22, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm17, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm21, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movb $24, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[4,5,4,5],zmm12[4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] -; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm11[4,5,4,5],zmm10[4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] +; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm16, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,11,4,11] +; AVX512F-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm29[4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[4,5,4,5],zmm13[4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm13, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[4,5,4,5],zmm9[4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm12[4,5,4,5],zmm24[4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm10, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm16, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm20, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm21, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm3[4,5,4,5],zmm15[4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm15, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm2[4,5,4,5],zmm9[4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm10, %zmm20 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,13] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm11 ; AVX512F-ONLY-FAST-NEXT: movb $-32, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, (%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm12 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm25, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1408(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm16, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm16, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 192(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 128(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 192(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 128(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm9, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm15, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $2216, %rsp # imm = 0x8A8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: load_i64_stride7_vf32: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovaps 1024(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512DQ-SLOW-NEXT: subq $2216, %rsp # imm = 0x8A8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovaps 1024(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovaps 576(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6] +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] +; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm20, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] +; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm28, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] +; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm14 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] ; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512DQ-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm26, %zmm23 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm15 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm20, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm22, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm20, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm17, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm20, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm21, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm27 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm20 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm20 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm17 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm30 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: movb $24, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm17 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[4,5,4,5],zmm12[4,5,4,5] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm22 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512DQ-SLOW-NEXT: # ymm24 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm11[4,5,4,5],zmm10[4,5,4,5] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm26 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] +; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm16, %zmm27 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,11,4,11] +; AVX512DQ-SLOW-NEXT: # ymm20 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm29[4,5,4,5] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[4,5,4,5],zmm13[4,5,4,5] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm13, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[4,5,4,5],zmm9[4,5,4,5] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm21, %zmm17 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm12[4,5,4,5],zmm24[4,5,4,5] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm16, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm20, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm21, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm3[4,5,4,5],zmm15[4,5,4,5] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm15, %zmm3, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm15, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm2[4,5,4,5],zmm9[4,5,4,5] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 912(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm10, %zmm20 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 ; AVX512DQ-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm26, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa 960(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm13 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm12 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [6,13] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 960(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm11 ; AVX512DQ-SLOW-NEXT: movb $-32, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512DQ-SLOW-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, (%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm25, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm16, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm16, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 192(%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 192(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 128(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-SLOW-NEXT: vmovaps %zmm9, 64(%rax) -; AVX512DQ-SLOW-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm15, 64(%rax) +; AVX512DQ-SLOW-NEXT: addq $2216, %rsp # imm = 0x8A8 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i64_stride7_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm19 -; AVX512DQ-FAST-NEXT: vmovaps 1024(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512DQ-FAST-NEXT: subq $2216, %rsp # imm = 0x8A8 +; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovaps 1024(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQ-FAST-NEXT: vmovaps 576(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm6 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] +; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] +; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] +; AVX512DQ-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm14 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] ; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512DQ-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm26, %zmm23 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm15 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm22, %zmm21 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm31 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm11 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm25 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm17, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm29 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm22 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm21, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm27 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm20 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm20 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm17 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm30 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movb $24, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm17 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[4,5,4,5],zmm12[4,5,4,5] -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] -; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm22 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm11[4,5,4,5],zmm10[4,5,4,5] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm26 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] +; AVX512DQ-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm16, %zmm27 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,11,4,11] +; AVX512DQ-FAST-NEXT: # ymm20 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm29[4,5,4,5] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[4,5,4,5],zmm13[4,5,4,5] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm13, %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm13, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[4,5,4,5],zmm9[4,5,4,5] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm17 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm12[4,5,4,5],zmm24[4,5,4,5] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm10, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm16, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm20, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm21, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm3[4,5,4,5],zmm15[4,5,4,5] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm15, %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm15, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm2[4,5,4,5],zmm9[4,5,4,5] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 912(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm10, %zmm20 ; AVX512DQ-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 1024(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 ; AVX512DQ-FAST-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa 960(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm12 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,13] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 960(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm19 +; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm11 ; AVX512DQ-FAST-NEXT: movb $-32, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512DQ-FAST-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa 1536(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm12 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k2} +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 {%k2} +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm25, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vmovdqa 1408(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm16, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa 1536(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm16, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 192(%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 128(%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 64(%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 192(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 128(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 128(%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-FAST-NEXT: vmovaps %zmm9, 64(%rax) -; AVX512DQ-FAST-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm15, 64(%rax) +; AVX512DQ-FAST-NEXT: addq $2216, %rsp # imm = 0x8A8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: load_i64_stride7_vf32: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: subq $2152, %rsp # imm = 0x868 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm21 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vmovaps 576(%rdi), %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm31, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] ; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] +; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm25 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] ; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm29, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm22 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm31, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm15, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm28, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm21, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm16 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[4,5,4,5],zmm12[4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512BW-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm16, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512BW-ONLY-SLOW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm14[4,5,4,5],zmm11[4,5,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [7,0,9,0,7,0,9,0] +; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm17, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [4,11,4,11] +; AVX512BW-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm21, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm22[4,5,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm21, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm6[4,5,4,5],zmm3[4,5,4,5] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm7[4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm3, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[4,5,4,5],zmm13[4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm13, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[4,5,4,5],zmm30[4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm24, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm1 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm22 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,11] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm26, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [6,13] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm5 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm5 = ymm19[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm28 ; AVX512BW-ONLY-SLOW-NEXT: movb $-32, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm24 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, (%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm6, %zmm25, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm26, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm20, %zmm17, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 192(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 192(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 128(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 128(%rax) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: load_i64_stride7_vf32: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $2120, %rsp # imm = 0x848 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: subq $2152, %rsp # imm = 0x868 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm21 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovaps 576(%rdi), %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm31, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] ; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] +; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm25 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] ; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm22 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm31, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm15, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm21, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm16 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm15 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: movb $24, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[4,5,4,5],zmm12[4,5,4,5] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512BW-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm16, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512BW-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm14[4,5,4,5],zmm11[4,5,4,5] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [7,0,9,0,7,0,9,0] +; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm17, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [4,11,4,11] +; AVX512BW-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm22[4,5,4,5] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm6[4,5,4,5],zmm3[4,5,4,5] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm7[4,5,4,5] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm24, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm6, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[4,5,4,5],zmm13[4,5,4,5] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm13, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[4,5,4,5],zmm30[4,5,4,5] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm24, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm1 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm22 ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,11] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm26, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm2, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [6,13] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm5 = ymm19[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm28 ; AVX512BW-ONLY-FAST-NEXT: movb $-32, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm16 -; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, (%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm6, %zmm25, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm26, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %ymm20 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm20, %zmm17, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm20 +; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 192(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 64(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 128(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 192(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 128(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 128(%rax) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $2120, %rsp # imm = 0x848 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: load_i64_stride7_vf32: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: subq $2152, %rsp # imm = 0x868 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm21 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm11 ; AVX512DQBW-SLOW-NEXT: vmovaps 576(%rdi), %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm31, %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm18 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] ; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] +; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm25 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] ; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm29, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm22 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm31, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm15, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm28, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm21, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm16 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm15 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm29 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: movb $24, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm17 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[4,5,4,5],zmm12[4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm16, %zmm19 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512DQBW-SLOW-NEXT: # ymm24 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm14[4,5,4,5],zmm11[4,5,4,5] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7] +; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm13 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [7,0,9,0,7,0,9,0] +; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm17, %zmm26 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [4,11,4,11] +; AVX512DQBW-SLOW-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm21, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm22[4,5,4,5] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm21, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm6[4,5,4,5],zmm3[4,5,4,5] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm7[4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm3, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[4,5,4,5],zmm13[4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm13, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[4,5,4,5],zmm30[4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm24, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 912(%rdi), %xmm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm22 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,11] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm26, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [6,13] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm5 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm5 = ymm19[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm28 ; AVX512DQBW-SLOW-NEXT: movb $-32, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa 960(%rdi), %ymm2 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm16 -; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm8 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm24 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r8) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, (%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa 960(%rdi), %ymm6 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm6, %zmm25, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm26, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %ymm20 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm20, %zmm17, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm20 +; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm13 +; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 192(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 192(%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 128(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 128(%rax) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 64(%rax) -; AVX512DQBW-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512DQBW-SLOW-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: load_i64_stride7_vf32: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $2120, %rsp # imm = 0x848 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: subq $2152, %rsp # imm = 0x868 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm21 ; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm11 ; AVX512DQBW-FAST-NEXT: vmovaps 576(%rdi), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm31, %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm18 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] ; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] +; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm25 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] ; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm22 ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm31, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm15, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm21, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm16 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm15 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm29 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: movb $24, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm17 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[4,5,4,5],zmm12[4,5,4,5] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512DQBW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm16, %zmm19 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512DQBW-FAST-NEXT: # ymm24 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm14[4,5,4,5],zmm11[4,5,4,5] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm13 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [7,0,9,0,7,0,9,0] +; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm17, %zmm26 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [4,11,4,11] +; AVX512DQBW-FAST-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm22[4,5,4,5] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm6[4,5,4,5],zmm3[4,5,4,5] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm7[4,5,4,5] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm24, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm6, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[4,5,4,5],zmm13[4,5,4,5] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm13, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm13, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[4,5,4,5],zmm30[4,5,4,5] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm24, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 912(%rdi), %xmm1 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm22 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,11] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm26, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm2, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [6,13] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa 1088(%rdi), %ymm5 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm5 = ymm19[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm28 ; AVX512DQBW-FAST-NEXT: movb $-32, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa 960(%rdi), %ymm2 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %ymm16 -; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa 1536(%rdi), %ymm8 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 128(%r8) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, (%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa 960(%rdi), %ymm6 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm6, %zmm25, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm26, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %ymm20 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm20, %zmm17, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %ymm20 +; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa 1536(%rdi), %ymm13 +; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm13 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm14 +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 192(%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 64(%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 128(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 192(%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 64(%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 128(%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 128(%rax) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 64(%rax) -; AVX512DQBW-FAST-NEXT: addq $2120, %rsp # imm = 0x848 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512DQBW-FAST-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <224 x i64>, ptr %in.vec, align 64 @@ -7313,58 +7371,58 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i64_stride7_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $3240, %rsp # imm = 0xCA8 -; SSE-NEXT: movapd 208(%rdi), %xmm0 -; SSE-NEXT: movapd 96(%rdi), %xmm1 -; SSE-NEXT: movapd 144(%rdi), %xmm2 -; SSE-NEXT: movapd 192(%rdi), %xmm3 -; SSE-NEXT: movapd 80(%rdi), %xmm4 -; SSE-NEXT: movapd 128(%rdi), %xmm5 -; SSE-NEXT: movapd 176(%rdi), %xmm8 -; SSE-NEXT: movapd 64(%rdi), %xmm9 -; SSE-NEXT: movapd (%rdi), %xmm10 -; SSE-NEXT: movapd 16(%rdi), %xmm7 -; SSE-NEXT: movapd 32(%rdi), %xmm6 -; SSE-NEXT: movapd 48(%rdi), %xmm14 -; SSE-NEXT: movapd 224(%rdi), %xmm11 -; SSE-NEXT: movapd 112(%rdi), %xmm12 -; SSE-NEXT: movapd 160(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm14, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm9[0] +; SSE-NEXT: movapd 208(%rdi), %xmm3 +; SSE-NEXT: movapd 96(%rdi), %xmm2 +; SSE-NEXT: movapd 144(%rdi), %xmm4 +; SSE-NEXT: movapd 192(%rdi), %xmm6 +; SSE-NEXT: movapd 80(%rdi), %xmm5 +; SSE-NEXT: movapd 128(%rdi), %xmm8 +; SSE-NEXT: movapd 176(%rdi), %xmm11 +; SSE-NEXT: movapd 64(%rdi), %xmm10 +; SSE-NEXT: movapd (%rdi), %xmm12 +; SSE-NEXT: movapd 16(%rdi), %xmm9 +; SSE-NEXT: movapd 32(%rdi), %xmm7 +; SSE-NEXT: movapd 48(%rdi), %xmm0 +; SSE-NEXT: movapd 224(%rdi), %xmm13 +; SSE-NEXT: movapd 112(%rdi), %xmm14 +; SSE-NEXT: movapd 160(%rdi), %xmm1 +; SSE-NEXT: movapd %xmm0, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm10[0] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm5[0] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm4[0] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm15, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm8[0] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm2[0] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm11[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm6[0] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm3[0] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 272(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 288(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm1[0] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm1[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7870,7 +7928,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 2864(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] -; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 2816(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7907,16 +7965,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 3024(%rdi), %xmm1 +; SSE-NEXT: movapd 3024(%rdi), %xmm2 ; SSE-NEXT: movapd 3072(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] -; SSE-NEXT: movapd 3088(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm3[0] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: movapd 3088(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 3040(%rdi), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 3104(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7950,15 +8008,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 3248(%rdi), %xmm9 ; SSE-NEXT: movapd 3296(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] -; SSE-NEXT: movapd 3312(%rdi), %xmm14 -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm14[0] +; SSE-NEXT: movapd %xmm0, %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm9[0],xmm7[1] +; SSE-NEXT: movapd 3312(%rdi), %xmm15 +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm15[0] ; SSE-NEXT: movapd 3264(%rdi), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] ; SSE-NEXT: movapd 3328(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movapd 3280(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7967,48 +8025,48 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 3360(%rdi), %xmm5 -; SSE-NEXT: movapd 3408(%rdi), %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1] +; SSE-NEXT: movapd 3360(%rdi), %xmm6 +; SSE-NEXT: movapd 3408(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] ; SSE-NEXT: movapd 3424(%rdi), %xmm11 -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm11[0] -; SSE-NEXT: movapd 3376(%rdi), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] -; SSE-NEXT: movapd 3440(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm3[0] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 3392(%rdi), %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 3456(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 3472(%rdi), %xmm2 -; SSE-NEXT: movapd 3520(%rdi), %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm11[0] +; SSE-NEXT: movapd 3376(%rdi), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] +; SSE-NEXT: movapd 3440(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 3392(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 3456(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 3472(%rdi), %xmm5 +; SSE-NEXT: movapd 3520(%rdi), %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] ; SSE-NEXT: movapd 3536(%rdi), %xmm8 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm8[0] +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm8[0] ; SSE-NEXT: movapd 3488(%rdi), %xmm13 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm13[0],xmm8[1] ; SSE-NEXT: movapd 3552(%rdi), %xmm0 ; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm0[0] -; SSE-NEXT: movapd 3504(%rdi), %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd 3504(%rdi), %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm3, %xmm0 -; SSE-NEXT: movapd 3568(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm3[0] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movapd 3568(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] -; SSE-NEXT: movapd %xmm1, 496(%rsi) -; SSE-NEXT: movapd %xmm7, 480(%rsi) -; SSE-NEXT: movapd %xmm6, 464(%rsi) +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movapd %xmm2, 496(%rsi) +; SSE-NEXT: movapd %xmm4, 480(%rsi) +; SSE-NEXT: movapd %xmm7, 464(%rsi) ; SSE-NEXT: movapd %xmm10, 448(%rsi) -; SSE-NEXT: movapd %xmm15, 432(%rsi) +; SSE-NEXT: movapd %xmm14, 432(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8017,61 +8075,61 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 384(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 352(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 336(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 320(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 304(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 288(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 272(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 256(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 176(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movapd %xmm2, 496(%rdx) -; SSE-NEXT: movapd %xmm5, 480(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 320(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 304(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 272(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 256(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movapd %xmm5, 496(%rdx) +; SSE-NEXT: movapd %xmm6, 480(%rdx) ; SSE-NEXT: movapd %xmm9, 464(%rdx) ; SSE-NEXT: movapd %xmm12, 448(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 432(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rdx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 400(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%rdx) @@ -8125,7 +8183,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movapd %xmm8, 496(%rcx) ; SSE-NEXT: movapd %xmm11, 480(%rcx) -; SSE-NEXT: movapd %xmm14, 464(%rcx) +; SSE-NEXT: movapd %xmm15, 464(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 448(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8187,7 +8245,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm13, 496(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 480(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 464(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 448(%r8) @@ -8377,7 +8435,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm3, 496(%rax) +; SSE-NEXT: movapd %xmm1, 496(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 480(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8445,743 +8503,749 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $4264, %rsp # imm = 0x10A8 -; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 +; AVX1-ONLY-NEXT: subq $4232, %rsp # imm = 0x1088 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1664(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1568(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1168(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 1616(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1664(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 2112(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2176(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1568(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 2016(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1616(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 2064(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2560(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2624(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 2464(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2512(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2112(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2176(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 3072(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 2912(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2960(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2064(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2560(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2624(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 3520(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 3360(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 3408(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2512(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 3072(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2912(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2960(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 3520(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 3360(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3408(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1952(%rdi), %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2336(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2400(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2288(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2784(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2848(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2736(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1952(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2336(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3232(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 3296(%rdi), %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2400(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 2288(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2784(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 2848(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2688(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovapd 2736(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm12[0],xmm2[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 3136(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3184(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[3],ymm2[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3232(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, 3296(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm4[0],ymm14[0],ymm4[3],ymm14[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 3136(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovapd 3184(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm13[0],ymm3[3],ymm13[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm5[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm13[0],ymm5[0],ymm13[3],ymm5[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[0],ymm12[0],ymm6[3],ymm12[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[3],ymm9[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0],ymm4[0],ymm10[3],ymm4[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm15[0],ymm8[0],ymm15[3],ymm8[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 2176(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm10[0],ymm0[3],ymm10[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 2624(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm0[0],ymm13[0],ymm0[3],ymm13[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 3072(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 2176(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm7[0],ymm11[0],ymm7[3],ymm11[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 3424(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 3520(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 2624(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[0],ymm10[0],ymm8[3],ymm10[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 3072(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 3424(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 3520(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[3],ymm15[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[0],ymm0[0],ymm14[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[0],ymm0[0],ymm14[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1056(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1952(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1952(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm14[0],ymm0[3],ymm14[2] -; AVX1-ONLY-NEXT: vmovdqa 1856(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2400(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[3],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 1856(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2400(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm11[0],ymm0[3],ymm11[2] -; AVX1-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2848(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm6[0],ymm15[0],ymm6[3],ymm15[2] -; AVX1-ONLY-NEXT: vmovdqa 2752(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2848(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 2752(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3296(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm3[0],ymm0[0],ymm3[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovapd 3296(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2] ; AVX1-ONLY-NEXT: vmovdqa 3200(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm5[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] ; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2144(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovapd 2032(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2592(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovapd 2480(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 3376(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovdqa 3152(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2816(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2144(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd 2032(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2592(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vmovapd 2480(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovdqa 2704(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovapd 3376(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm1[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovdqa 3152(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2368(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2816(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vmovdqa 2256(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovdqa 2704(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 2368(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovdqa 2256(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 1920(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vmovapd 1808(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm2[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm4[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 1360(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 912(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm9[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm11[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm13[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm13[1],ymm14[0],ymm13[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1],ymm12[0],ymm5[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm11[1],ymm13[0],ymm11[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm0[1],ymm7[0],ymm0[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovapd 1808(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm5[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm9[1],ymm10[0],ymm9[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm6[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovapd 1360(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm3[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovapd 912(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm1[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm6[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm11[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm12[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm12[1],ymm13[0],ymm12[2],ymm13[2] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm0[1],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm11[0],ymm6[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovupd %ymm9, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm9[0],ymm6[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm1[1],ymm9[0],ymm1[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1200(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm6[0],ymm4[2],ymm6[2] ; AVX1-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[1],ymm0[0],ymm4[2],ymm0[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm0[0],ymm3[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm4[0],ymm1[2],ymm4[2] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[2] ; AVX1-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1872(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2096(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1872(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2544(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm0[1],ymm15[0],ymm0[2],ymm15[2] -; AVX1-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm5[1],ymm0[0],ymm5[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2768(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2096(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2992(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 2320(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3216(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm0[0],ymm10[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm14[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2544(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 3328(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm14[0],ymm0[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3440(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2768(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 3552(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[1],ymm4[0],ymm0[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2992(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3216(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 3328(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3440(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm4[1],ymm5[0],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 3552(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = mem[0],xmm13[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm13[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm0[0],xmm11[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd $2, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm9[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm15[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 1600(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm7[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm14[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm12[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 2144(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 2048(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm14[0],xmm12[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm12[0],xmm13[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 2368(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 2272(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm13[0],xmm7[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm9[0],xmm10[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 2592(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 2496(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm14[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm10[0],xmm6[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1376(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 2816(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 2720(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1600(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm1[0],xmm4[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1920(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1824(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm1[0],xmm3[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps 2144(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2048(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 2368(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 2592(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm1[0],xmm15[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 2816(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2720(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 3040(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2944(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 3264(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 3168(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 3168(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3392(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 3392(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9190,36 +9254,38 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[3],ymm1[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[3],ymm1[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm13[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[3],ymm1[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[3],ymm1[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm1 @@ -9228,169 +9294,168 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] ; AVX1-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[3],ymm13[2] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm9[0],ymm15[0],ymm9[3],ymm15[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 1888(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1984(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[0],ymm10[0],ymm4[3],ymm10[2] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1984(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm7[0],ymm11[0],ymm7[3],ymm11[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 2208(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[3],ymm12[2] -; AVX1-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm12[0],ymm10[3],ymm12[2] +; AVX1-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 2336(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 2432(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm7[0],ymm3[3],ymm7[2] +; AVX1-ONLY-NEXT: vmovapd 2432(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm9[0],ymm3[3],ymm9[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2656(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[3],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 2560(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 2656(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[3],ymm8[2] +; AVX1-ONLY-NEXT: vmovdqa 2560(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 2784(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 2880(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm15[0],ymm4[0],ymm15[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovapd 2880(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm5[0],ymm2[3],ymm5[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 3104(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 3008(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 3008(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 3232(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 3328(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm0[0],ymm3[0],ymm0[3],ymm3[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm14[0],ymm3[0],ymm14[3],ymm3[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 3552(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2] ; AVX1-ONLY-NEXT: vmovdqa 3456(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm13[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = xmm14[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm14[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2,3],xmm13[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm10[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm11 = xmm11[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,2,3],xmm10[4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm7[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm9[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm5[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm3[3] @@ -9552,7 +9617,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) @@ -9599,15 +9664,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovapd %ymm0, 480(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm3, 448(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm4, 416(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 384(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 384(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm8, 352(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 320(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 320(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm10, 288(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 256(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 256(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm14, 224(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9618,63 +9683,62 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $4264, %rsp # imm = 0x10A8 +; AVX1-ONLY-NEXT: addq $4232, %rsp # imm = 0x1088 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride7_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3976, %rsp # imm = 0xF88 -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm4 +; AVX2-ONLY-NEXT: subq $3928, %rsp # imm = 0xF58 +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm11 ; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2016(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2064(%rdi), %xmm1 @@ -9682,221 +9746,220 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 2560(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqa 2624(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2464(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3008(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovdqa 3072(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2912(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2960(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 3456(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 3520(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 3360(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 3408(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3072(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 2912(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2960(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3456(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3520(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 3360(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa 3408(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 944(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 944(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1392(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1392(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1792(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1792(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1840(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1840(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2240(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2784(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2240(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2848(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2688(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa 2736(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 2784(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2848(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3232(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqa 3296(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 3136(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovdqa 3184(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2688(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2736(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3232(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3296(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm5[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm13[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm11[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2624(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 3072(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 3136(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3184(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm10[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm12[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm7[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2624(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm13[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 3072(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3424(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 3520(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3424(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 3520(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -9907,116 +9970,120 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm14[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa 2848(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2752(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2848(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 3296(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2752(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = xmm15[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3296(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 352(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 800(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] -; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1248(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] -; AVX2-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1696(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 2144(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX2-ONLY-NEXT: vmovdqa 2032(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 2592(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovdqa 2480(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 3040(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps 2928(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 3488(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 800(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps 3376(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 3264(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 1248(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps 1136(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 1696(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 2816(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpbroadcastq 2144(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX2-ONLY-NEXT: vmovdqa 2032(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 2592(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovdqa 2480(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 3040(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovdqa 2928(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 3488(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-ONLY-NEXT: vmovdqa 3376(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 3264(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 2368(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpbroadcastq 2816(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 2368(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastq 1920(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1472(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpbroadcastq 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 1024(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -10040,213 +10107,207 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2752(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2752(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 3328(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3424(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3424(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 3552(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1920(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1824(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2048(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 1920(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1824(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2048(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 2368(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 2368(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2272(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2272(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 2592(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -10255,520 +10316,525 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 2816(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vmovdqa 2816(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2720(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 3040(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2944(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vmovdqa 3264(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 3168(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 3168(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vmovdqa 3488(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 3392(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vmovdqa 3488(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 3392(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm13[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm9[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %xmm2, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2656(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 2656(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2560(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2560(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2784(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2784(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3008(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 3232(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 3328(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3552(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3456(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3008(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3232(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 3328(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3552(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm8[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3456(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = mem[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = mem[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = mem[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%r9) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rax) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 480(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 448(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 384(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 352(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 320(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, 288(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, 480(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 448(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 416(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 384(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 352(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 320(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 288(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm13, 256(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 224(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 192(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm15, 224(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $3976, %rsp # imm = 0xF88 +; AVX2-ONLY-NEXT: addq $3928, %rsp # imm = 0xF58 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride7_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $6536, %rsp # imm = 0x1988 -; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm22 ; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm13 ; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm14 ; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 @@ -10776,43 +10842,43 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm18, %zmm4, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 @@ -10820,841 +10886,832 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa 2816(%rdi), %ymm0 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512F-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [4,11] ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm5, %zmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 ; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm30 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,5,6,13,4,5,6,13] -; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm30, %zmm16, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm31 +; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm28 ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm31, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm18 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm13 ; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm16, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm28, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm21 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm7 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm30 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm10 ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm16, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm27 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm16, %zmm5 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm19 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm16, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm25, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm14, %zmm4, %zmm6 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm6 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm19, %zmm5, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa 2368(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm16, %zmm5 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa 1920(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm7 -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm13 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm16, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm5 -; AVX512F-NEXT: vmovdqa 3264(%rdi), %ymm7 -; AVX512F-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm7 -; AVX512F-NEXT: vpermi2q %zmm12, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm9, %zmm7, %zmm4 -; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm29 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm16, %zmm4 -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 2368(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm26 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm9, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 1920(%rdi), %ymm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm5, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm11 +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm8 +; AVX512F-NEXT: vmovdqa 3264(%rdi), %ymm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm6 +; AVX512F-NEXT: vpermi2q %zmm11, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm31 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm4 +; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm4, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa 2880(%rdi), %ymm4 +; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm0 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm11, %zmm4, %zmm6 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,14,4,5,6,14] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm30, %zmm5, %zmm6 -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm7 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm8, %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,6,14,4,5,6,14] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm16 # 64-byte Folded Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm9 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm15, %zmm4, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm5, %zmm6 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm8, %zmm7 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm5, %zmm3 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 2432(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm25, %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512F-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1984(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 2432(%rdi), %ymm2 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm5, %zmm3 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 3328(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vpermi2q %zmm25, %zmm12, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm31, %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm5, %zmm4 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm3 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 1984(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa64 %ymm29, %ymm3 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm27, %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm3 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 3328(%rdi), %ymm2 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vpermi2q %zmm11, %zmm6, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm31, %zmm5, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm8, %zmm4 ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [10,3,10,3,10,3,10,3] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm15, %zmm26 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [11,4,11,4,11,4,11,4] -; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm20, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [10,3,10,3,10,3,10,3] +; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm26, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [11,4,11,4,11,4,11,4] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [12,5,12,5,12,5,12,5] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [12,5,12,5,12,5,12,5] -; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm25, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,6,13,6,13,6,13,6] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm15, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm20, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm26, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm26, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm26, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm3, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm7, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm26, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm3, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm7, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm12, %zmm23 ; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm20, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm20, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm27 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm15, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm20, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm20, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm27 -; AVX512F-NEXT: vpermi2q %zmm9, %zmm31, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm20, %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm17, %zmm8, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm21, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm9, %zmm31, %zmm20 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm30, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm25, %zmm24 -; AVX512F-NEXT: vpermi2q %zmm9, %zmm31, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm2 -; AVX512F-NEXT: vpermi2q %zmm9, %zmm31, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm7, %zmm28 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm26 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,10,0,5,6,10] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm24 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm22 ; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,12,0,5,6,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm23 +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm12 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm31 -; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm27 +; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <0,7,14,u> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm9, %zmm19 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm11[4,5,4,5],zmm6[4,5,4,5] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [7,0,9,0,7,0,9,0] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,11,4,11] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm9, %zmm29 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm13[4,5,4,5],zmm18[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm18 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm16[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm23 = <0,7,14,u> +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm21 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm17 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm13[4,5,4,5],zmm29[4,5,4,5] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,0,9,0,7,0,9,0] +; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k1} = zmm0[4,5,4,5],zmm17[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm0 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,11,4,11] +; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm17, %zmm6, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm20, %zmm23, %zmm24 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[4,5,4,5],zmm25[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm25, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k1} = zmm7[4,5,4,5],zmm21[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm21, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm9, %zmm23, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm21, %zmm9, %zmm17 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm0[4,5,4,5],zmm23[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k1} = zmm1[4,5,4,5],zmm17[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm0[4,5,4,5],zmm30[4,5,4,5] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm23, %zmm25 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm23 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm18, %zmm23, %zmm19 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] -; AVX512F-NEXT: vpermt2q %zmm12, %zmm30, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm30, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm23, %zmm17 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm30, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm30, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm5 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm30, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm13, %zmm10, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm30, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm4 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm2[4,5,4,5],zmm4[4,5,4,5] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm22 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm1, %zmm14, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm22 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm4, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm30, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512F-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <9,0,7,u> +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm16 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 +; AVX512F-NEXT: vpermi2q %zmm10, %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm4 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm7 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm14 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm23, %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm14, %zmm1, %zmm1 -; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm14, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm15 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm22, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm14, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm15, %zmm1, %zmm1 -; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1408(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm14, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 960(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm14, %zmm25 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm15, %zmm1, %zmm1 -; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 2304(%rdi), %ymm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512F-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 1408(%rdi), %ymm15 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm1, %zmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm11 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 1856(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm1 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm11, %zmm14, %zmm14 -; AVX512F-NEXT: vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 3200(%rdi), %ymm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa 960(%rdi), %ymm14 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512F-NEXT: vinserti32x4 $0, %xmm14, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm9, 448(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm13, 384(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm31, 320(%rsi) +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm14, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 2304(%rdi), %ymm1 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm15 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa 1856(%rdi), %ymm13 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm13, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 3200(%rdi), %ymm15 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm29, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm23, 448(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm11, 384(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm19, 320(%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm0, 448(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm3, 256(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm12, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm24, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm5, 256(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm6, 320(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm20, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm4, 384(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm7, 448(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm20, 256(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm24, 320(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm27, 128(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rcx) +; AVX512F-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rcx) +; AVX512F-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm8, 320(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm27, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm30, 64(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 384(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm2, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm1, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm15, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm23, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm15, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm13, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm1, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm28, 384(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%r9) @@ -11694,7 +11751,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) @@ -11702,54 +11759,55 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, (%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: addq $6536, %rsp # imm = 0x1988 +; AVX512F-NEXT: addq $6728, %rsp # imm = 0x1A48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $6536, %rsp # imm = 0x1988 -; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm28 +; AVX512BW-NEXT: subq $6664, %rsp # imm = 0x1A08 +; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 @@ -11757,688 +11815,664 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm19, %zmm5, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm17, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 2816(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,11] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm13 +; AVX512BW-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 2816(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,11] ; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,7,14,0,0,7,14,0] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm30 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [4,5,6,13,4,5,6,13] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm15 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,6,13,4,5,6,13] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 ; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm18, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm26 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm2 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm18, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm2 ; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm18, %zmm4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 2368(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm5 -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 1920(%rdi), %ymm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm15, %zmm6 -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm18, %zmm6 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm9 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm5, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm9 +; AVX512BW-NEXT: vmovdqa 2368(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm9 +; AVX512BW-NEXT: vmovdqa 1920(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm12 +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm6 -; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm7, %zmm15 -; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm18, %zmm15 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 2880(%rdi), %ymm6 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm6, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,5,6,14,4,5,6,14] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm13, %zmm11 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm9 +; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm10 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 2880(%rdi), %ymm4 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm15 # 64-byte Folded Reload +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,14,4,5,6,14] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm8 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm8 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm13, %zmm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm1 ; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm7 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm1 ; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm13, %zmm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 2432(%rdi), %ymm1 ; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm7, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 1984(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm5, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 3328(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm14, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [10,3,10,3,10,3,10,3] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm18, %zmm0 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm10, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm4 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [10,3,10,3,10,3,10,3] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm19, %zmm21 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [11,4,11,4,11,4,11,4] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm9, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [12,5,12,5,12,5,12,5] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm18, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,9,2,9,2,9,2,9] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm18, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm29, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm19, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm20, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm18, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm11, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm20, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm27 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm18, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm30 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm18, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm4, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm6, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm28 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm6, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm6, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm24 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm29 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm22 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm18 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,10,0,5,6,10] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm26 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm19 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,10,0,5,6,10] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm20 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,12,0,5,6,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm27 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm22 +; AVX512BW-NEXT: movb $24, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,7,14,u> ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,4,5],zmm31[4,5,4,5] ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [7,0,9,0,7,0,9,0] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [4,11,4,11] +; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm16, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[4,5,4,5],zmm25[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm16, %zmm27 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k1} = zmm1[4,5,4,5],zmm23[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm25 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm1[4,5,4,5],zmm9[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm24 -; AVX512BW-NEXT: movb $24, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <0,7,14,u> -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm16, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k1} = zmm8[4,5,4,5],zmm26[4,5,4,5] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [7,0,9,0,7,0,9,0] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,11,4,11] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm29 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[4,5,4,5],zmm16[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k1} = zmm13[4,5,4,5],zmm17[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm14, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm0[4,5,4,5],zmm15[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm14, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm12[4,5,4,5],zmm0[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm0[4,5,4,5],zmm21[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm28, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm2[4,5,4,5] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[4,5,4,5],zmm4[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm21 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm23 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm26, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <9,0,7,u> +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm5[4,5,4,5],zmm4[4,5,4,5] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm30 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm21, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm15, %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] @@ -12448,13 +12482,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovups (%rsp), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -12475,170 +12503,179 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm30, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm31 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm1 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm21, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm1 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %ymm18 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm18 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm18 = mem[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti32x4 $1, %ymm18, %xmm18 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm23, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm19, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %ymm19 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm19 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti32x4 $1, %ymm19, %xmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm20, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %ymm20 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm20, %zmm0, %zmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %ymm21 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm21 = mem[8,9,10,11,12,13,14,15],ymm21[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm21[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm21, %xmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %ymm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm20, %zmm21, %zmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %ymm22 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm22 = mem[8,9,10,11,12,13,14,15],ymm22[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm22[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm22, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %ymm23 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm23 = mem[8,9,10,11,12,13,14,15],ymm23[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm23[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm23, %xmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm23, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm22, %zmm21, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} ; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %ymm24 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm24 = mem[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti32x4 $1, %ymm24, %xmm24 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %ymm25 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm25 = mem[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm25, %xmm25 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 384(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 320(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 256(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 384(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 448(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 256(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 320(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rcx) +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm21, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %ymm24 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm24 = mem[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm24, %xmm24 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm26, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 320(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm10, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm10, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 448(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 320(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm31, 384(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 384(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12675,7 +12712,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) @@ -12687,9 +12724,9 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: addq $6536, %rsp # imm = 0x1988 +; AVX512BW-NEXT: addq $6664, %rsp # imm = 0x1A08 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <448 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll index b84fff274cd71..7d29ed1192bd4 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll @@ -167,69 +167,69 @@ define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i64_stride8_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movaps 112(%rdi), %xmm5 +; SSE-NEXT: movaps 112(%rdi), %xmm6 ; SSE-NEXT: movaps 240(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm8 -; SSE-NEXT: movaps 224(%rdi), %xmm10 +; SSE-NEXT: movaps 96(%rdi), %xmm9 +; SSE-NEXT: movaps 224(%rdi), %xmm11 ; SSE-NEXT: movaps 160(%rdi), %xmm0 -; SSE-NEXT: movaps 80(%rdi), %xmm12 -; SSE-NEXT: movaps 208(%rdi), %xmm13 +; SSE-NEXT: movaps 80(%rdi), %xmm14 +; SSE-NEXT: movaps 208(%rdi), %xmm15 ; SSE-NEXT: movaps 144(%rdi), %xmm2 -; SSE-NEXT: movaps 64(%rdi), %xmm14 +; SSE-NEXT: movaps 64(%rdi), %xmm12 ; SSE-NEXT: movaps (%rdi), %xmm7 -; SSE-NEXT: movaps 16(%rdi), %xmm6 -; SSE-NEXT: movaps 32(%rdi), %xmm4 -; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps 192(%rdi), %xmm15 -; SSE-NEXT: movaps 128(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] -; SSE-NEXT: movaps %xmm7, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] -; SSE-NEXT: movaps %xmm2, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1] -; SSE-NEXT: movaps %xmm6, %xmm13 +; SSE-NEXT: movaps 16(%rdi), %xmm5 +; SSE-NEXT: movaps 32(%rdi), %xmm3 +; SSE-NEXT: movaps 48(%rdi), %xmm4 +; SSE-NEXT: movaps 192(%rdi), %xmm13 +; SSE-NEXT: movaps 128(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1] +; SSE-NEXT: movaps %xmm7, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1] -; SSE-NEXT: movaps %xmm0, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm15[1] +; SSE-NEXT: movaps %xmm5, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm14[1] +; SSE-NEXT: movaps %xmm0, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] -; SSE-NEXT: movaps 176(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1] +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; SSE-NEXT: movaps 176(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps %xmm15, (%rsi) -; SSE-NEXT: movaps %xmm11, 16(%rsi) +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps %xmm13, (%rsi) +; SSE-NEXT: movaps %xmm10, 16(%rsi) ; SSE-NEXT: movaps %xmm7, (%rdx) -; SSE-NEXT: movaps %xmm9, 16(%rdx) -; SSE-NEXT: movaps %xmm13, (%rcx) -; SSE-NEXT: movaps %xmm14, 16(%rcx) -; SSE-NEXT: movaps %xmm6, (%r8) +; SSE-NEXT: movaps %xmm8, 16(%rdx) +; SSE-NEXT: movaps %xmm15, (%rcx) +; SSE-NEXT: movaps %xmm12, 16(%rcx) +; SSE-NEXT: movaps %xmm5, (%r8) ; SSE-NEXT: movaps %xmm2, 16(%r8) -; SSE-NEXT: movaps %xmm10, (%r9) -; SSE-NEXT: movaps %xmm12, 16(%r9) +; SSE-NEXT: movaps %xmm11, (%r9) +; SSE-NEXT: movaps %xmm14, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm4, (%rax) +; SSE-NEXT: movaps %xmm3, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm8, (%rax) +; SSE-NEXT: movaps %xmm9, (%rax) ; SSE-NEXT: movaps %xmm1, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm5, 16(%rax) -; SSE-NEXT: movaps %xmm3, (%rax) +; SSE-NEXT: movaps %xmm6, 16(%rax) +; SSE-NEXT: movaps %xmm4, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride8_vf4: @@ -419,112 +419,112 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i64_stride8_vf8: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps 336(%rdi), %xmm11 -; SSE-NEXT: movaps 464(%rdi), %xmm6 -; SSE-NEXT: movaps 400(%rdi), %xmm7 -; SSE-NEXT: movaps 80(%rdi), %xmm0 -; SSE-NEXT: movaps 208(%rdi), %xmm1 -; SSE-NEXT: movaps 144(%rdi), %xmm8 -; SSE-NEXT: movaps 320(%rdi), %xmm2 -; SSE-NEXT: movaps 256(%rdi), %xmm10 -; SSE-NEXT: movaps 448(%rdi), %xmm3 +; SSE-NEXT: movaps 336(%rdi), %xmm0 +; SSE-NEXT: movaps 464(%rdi), %xmm1 +; SSE-NEXT: movaps 400(%rdi), %xmm8 +; SSE-NEXT: movaps 80(%rdi), %xmm2 +; SSE-NEXT: movaps 208(%rdi), %xmm3 +; SSE-NEXT: movaps 144(%rdi), %xmm9 +; SSE-NEXT: movaps 320(%rdi), %xmm4 +; SSE-NEXT: movaps 256(%rdi), %xmm11 +; SSE-NEXT: movaps 448(%rdi), %xmm5 ; SSE-NEXT: movaps 384(%rdi), %xmm12 -; SSE-NEXT: movaps 64(%rdi), %xmm4 +; SSE-NEXT: movaps 64(%rdi), %xmm6 ; SSE-NEXT: movaps (%rdi), %xmm13 -; SSE-NEXT: movaps 16(%rdi), %xmm9 -; SSE-NEXT: movaps 192(%rdi), %xmm5 +; SSE-NEXT: movaps 16(%rdi), %xmm10 +; SSE-NEXT: movaps 192(%rdi), %xmm7 ; SSE-NEXT: movaps 128(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm5[0] +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: movaps %xmm13, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1] +; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm3[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm10, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movaps 272(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 272(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm11[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 96(%rdi), %xmm0 -; SSE-NEXT: movaps 32(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps 32(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 224(%rdi), %xmm0 -; SSE-NEXT: movaps 160(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps 160(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 352(%rdi), %xmm0 -; SSE-NEXT: movaps 288(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: movaps 288(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 480(%rdi), %xmm0 +; SSE-NEXT: movaps 416(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 480(%rdi), %xmm1 -; SSE-NEXT: movaps 416(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE-NEXT: movaps 112(%rdi), %xmm1 -; SSE-NEXT: movaps 48(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 112(%rdi), %xmm0 +; SSE-NEXT: movaps 48(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 176(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 176(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] ; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 304(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movaps 304(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps 496(%rdi), %xmm0 ; SSE-NEXT: movaps 432(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) @@ -547,27 +547,27 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm11, 32(%r9) -; SSE-NEXT: movaps %xmm8, 48(%r9) +; SSE-NEXT: movaps %xmm12, 32(%r9) +; SSE-NEXT: movaps %xmm11, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm4, 48(%rax) -; SSE-NEXT: movaps %xmm9, 32(%rax) -; SSE-NEXT: movaps %xmm13, 16(%rax) -; SSE-NEXT: movaps %xmm12, (%rax) -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 48(%rax) -; SSE-NEXT: movaps %xmm5, 32(%rax) -; SSE-NEXT: movaps %xmm7, 16(%rax) +; SSE-NEXT: movaps %xmm8, 48(%rax) +; SSE-NEXT: movaps %xmm10, 32(%rax) +; SSE-NEXT: movaps %xmm9, 16(%rax) ; SSE-NEXT: movaps %xmm14, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm1, 48(%rax) -; SSE-NEXT: movaps %xmm3, 32(%rax) +; SSE-NEXT: movaps %xmm3, 48(%rax) +; SSE-NEXT: movaps %xmm4, 32(%rax) ; SSE-NEXT: movaps %xmm6, 16(%rax) -; SSE-NEXT: movaps %xmm10, (%rax) +; SSE-NEXT: movaps %xmm13, (%rax) +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movaps %xmm1, 48(%rax) +; SSE-NEXT: movaps %xmm2, 32(%rax) +; SSE-NEXT: movaps %xmm5, 16(%rax) +; SSE-NEXT: movaps %xmm7, (%rax) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq ; @@ -602,17 +602,17 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm11[0],xmm10[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm13[0],xmm12[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm12[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm11[1],xmm10[1] @@ -630,79 +630,79 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm2[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] ; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm4[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm3[0],xmm4[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm9[1],ymm2[3],ymm9[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm8, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %xmm11, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) ; AVX1-ONLY-NEXT: addq $184, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -713,13 +713,13 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm1, %ymm2 ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 @@ -736,52 +736,52 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm5[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm10, %ymm11 ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm13, %ymm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm12, %ymm12 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm13[0],ymm12[2],ymm13[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[2],ymm2[2] +; AVX2-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm10, %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm15, %ymm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm14, %ymm14 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm2[1],ymm15[3],ymm2[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm13[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm3[1],ymm9[3],ymm3[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) @@ -798,11 +798,11 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovaps %ymm8, (%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, (%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm13, (%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rax) @@ -815,229 +815,229 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512F-LABEL: load_i64_stride8_vf8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm18 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm12 ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm11 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] ; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm8, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm5 ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 -; AVX512F-NEXT: vpermi2q %zmm8, %zmm3, %zmm10 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm12, %zmm13 -; AVX512F-NEXT: vpermi2q %zmm8, %zmm3, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm8 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm13 +; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 {%k1} ; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm13 ; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm14 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] ; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm16 ; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm15[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm12 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm15 -; AVX512F-NEXT: vpermi2q %zmm8, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1} +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm15[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm9 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 +; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm13 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm3[0],zmm8[0],zmm3[2],zmm8[2],zmm3[4],zmm8[4],zmm3[6],zmm8[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm3[1],zmm8[1],zmm3[3],zmm8[3],zmm3[5],zmm8[5],zmm3[7],zmm8[7] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm3[0],zmm12[0],zmm3[2],zmm12[2],zmm3[4],zmm12[4],zmm3[6],zmm12[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm3[1],zmm12[1],zmm3[3],zmm12[3],zmm3[5],zmm12[5],zmm3[7],zmm12[7] ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm15, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm15, %zmm17 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] ; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm8, %zmm19, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm9[0],zmm6[0],zmm9[2],zmm6[2],zmm9[4],zmm6[4],zmm9[6],zmm6[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm9[1],zmm6[1],zmm9[3],zmm6[3],zmm9[5],zmm6[5],zmm9[7],zmm6[7] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} +; AVX512F-NEXT: vpermt2q %zmm12, %zmm19, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm11[0],zmm7[0],zmm11[2],zmm7[2],zmm11[4],zmm7[4],zmm11[6],zmm7[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm11[1],zmm7[1],zmm11[3],zmm7[3],zmm11[5],zmm7[5],zmm11[7],zmm7[7] +; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} ; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm8 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm9 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,13] -; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm14, %ymm14 -; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm9[0],ymm14[0],ymm9[2],ymm14[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm18, %zmm5, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm14[1],ymm9[3],ymm14[3] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm18, %zmm1, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm11 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm18, %zmm1, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm11, %ymm11 +; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm6, %ymm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm5, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm11[1],ymm6[3],ymm11[3] ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm16 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm16, %zmm7 ; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm15 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] ; AVX512F-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm18, %zmm1, %zmm8 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm17, %zmm8 ; AVX512F-NEXT: vpermt2q %zmm4, %zmm19, %zmm2 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm18, %zmm1, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm9, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm12, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm6, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm9, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm13, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%r10) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%r10) ; AVX512F-NEXT: vmovdqa64 %zmm8, (%rdi) -; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride8_vf8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm18 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm11 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm5 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm3, %zmm10 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm13 -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm3, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm8 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm13 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm14 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm16 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm15[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm12 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1} +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm15[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm9 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm13 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm3[0],zmm8[0],zmm3[2],zmm8[2],zmm3[4],zmm8[4],zmm3[6],zmm8[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm3[1],zmm8[1],zmm3[3],zmm8[3],zmm3[5],zmm8[5],zmm3[7],zmm8[7] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm3[0],zmm12[0],zmm3[2],zmm12[2],zmm3[4],zmm12[4],zmm3[6],zmm12[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm3[1],zmm12[1],zmm3[3],zmm12[3],zmm3[5],zmm12[5],zmm3[7],zmm12[7] ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm15, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm17 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm9[0],zmm6[0],zmm9[2],zmm6[2],zmm9[4],zmm6[4],zmm9[6],zmm6[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm9[1],zmm6[1],zmm9[3],zmm6[3],zmm9[5],zmm6[5],zmm9[7],zmm6[7] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm11[0],zmm7[0],zmm11[2],zmm7[2],zmm11[4],zmm7[4],zmm11[6],zmm7[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm11[1],zmm7[1],zmm11[3],zmm7[3],zmm11[5],zmm7[5],zmm11[7],zmm7[7] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm9 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,13] -; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm14, %ymm14 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm9[0],ymm14[0],ymm9[2],ymm14[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm14[1],ymm9[3],ymm14[3] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm11 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm11, %ymm11 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm6, %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm11[1],ymm6[3],ymm11[3] ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm16 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm16, %zmm7 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm15 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm17, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm2 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] ; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rdi) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <64 x i64>, ptr %in.vec, align 64 @@ -1205,29 +1205,31 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 608(%rdi), %xmm0 -; SSE-NEXT: movaps 544(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 544(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 736(%rdi), %xmm0 -; SSE-NEXT: movaps 672(%rdi), %xmm12 +; SSE-NEXT: movaps 672(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 864(%rdi), %xmm0 +; SSE-NEXT: movaps 800(%rdi), %xmm12 ; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movaps 864(%rdi), %xmm0 -; SSE-NEXT: movaps 800(%rdi), %xmm9 +; SSE-NEXT: movaps 992(%rdi), %xmm0 +; SSE-NEXT: movaps 928(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 992(%rdi), %xmm0 -; SSE-NEXT: movaps 928(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps 112(%rdi), %xmm0 ; SSE-NEXT: movaps 48(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -1236,43 +1238,41 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 176(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 304(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 496(%rdi), %xmm0 -; SSE-NEXT: movaps 432(%rdi), %xmm14 +; SSE-NEXT: movaps 176(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: movaps 368(%rdi), %xmm0 +; SSE-NEXT: movaps 304(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 496(%rdi), %xmm0 +; SSE-NEXT: movaps 432(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps 624(%rdi), %xmm0 ; SSE-NEXT: movaps 560(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] ; SSE-NEXT: movaps 752(%rdi), %xmm0 -; SSE-NEXT: movaps 688(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 688(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] ; SSE-NEXT: movaps 880(%rdi), %xmm0 ; SSE-NEXT: movaps 816(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps 1008(%rdi), %xmm0 -; SSE-NEXT: movaps 944(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movaps 944(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1354,10 +1354,12 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm7, 112(%rax) -; SSE-NEXT: movaps %xmm9, 96(%rax) -; SSE-NEXT: movaps %xmm12, 80(%rax) -; SSE-NEXT: movaps %xmm15, 64(%rax) +; SSE-NEXT: movaps %xmm9, 112(%rax) +; SSE-NEXT: movaps %xmm12, 96(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1367,26 +1369,24 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm4, 112(%rax) -; SSE-NEXT: movaps %xmm6, 96(%rax) -; SSE-NEXT: movaps %xmm8, 80(%rax) -; SSE-NEXT: movaps %xmm11, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) +; SSE-NEXT: movaps %xmm3, 112(%rax) +; SSE-NEXT: movaps %xmm4, 96(%rax) +; SSE-NEXT: movaps %xmm7, 80(%rax) +; SSE-NEXT: movaps %xmm8, 64(%rax) +; SSE-NEXT: movaps %xmm11, 48(%rax) +; SSE-NEXT: movaps %xmm15, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 112(%rax) +; SSE-NEXT: movaps %xmm2, 112(%rax) ; SSE-NEXT: movaps %xmm1, 96(%rax) -; SSE-NEXT: movaps %xmm2, 80(%rax) +; SSE-NEXT: movaps %xmm6, 80(%rax) ; SSE-NEXT: movaps %xmm5, 64(%rax) -; SSE-NEXT: movaps %xmm14, 48(%rax) -; SSE-NEXT: movaps %xmm10, 32(%rax) -; SSE-NEXT: movaps %xmm13, 16(%rax) +; SSE-NEXT: movaps %xmm10, 48(%rax) +; SSE-NEXT: movaps %xmm13, 32(%rax) +; SSE-NEXT: movaps %xmm14, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: addq $664, %rsp # imm = 0x298 @@ -1394,7 +1394,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride8_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $792, %rsp # imm = 0x318 +; AVX1-ONLY-NEXT: subq $808, %rsp # imm = 0x328 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -1524,13 +1524,13 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -1543,239 +1543,239 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm8[0],ymm11[0],ymm8[2],ymm11[2] -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm7[0],xmm9[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm6[0],xmm12[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm12[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm10[0],xmm11[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm4[0],ymm9[0],ymm4[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm3[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm3[0],xmm4[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm9[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm9[1],ymm4[3],ymm9[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm12[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rax) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rax) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm15, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rax) -; AVX1-ONLY-NEXT: addq $792, %rsp # imm = 0x318 -; AVX1-ONLY-NEXT: vzeroupper -; AVX1-ONLY-NEXT: retq -; -; AVX2-ONLY-LABEL: load_i64_stride8_vf16: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $808, %rsp # imm = 0x328 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm12[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm11[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%r9) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rax) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) +; AVX1-ONLY-NEXT: addq $808, %rsp # imm = 0x328 +; AVX1-ONLY-NEXT: vzeroupper +; AVX1-ONLY-NEXT: retq +; +; AVX2-ONLY-LABEL: load_i64_stride8_vf16: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: subq $808, %rsp # imm = 0x328 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm11, %ymm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm12, %ymm12 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm12[0],ymm6[2],ymm12[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm9[1],ymm3[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] -; AVX2-ONLY-NEXT: vmovaps %ymm11, %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm12[1],ymm6[3],ymm12[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] -; AVX2-ONLY-NEXT: vmovaps %ymm10, %ymm12 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm11 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm5[0],ymm13[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm6[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm3 +; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 @@ -1812,50 +1812,50 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm11[0],ymm7[2],ymm11[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm8[0],ymm5[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm15[0],ymm4[2],ymm15[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm13[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm5[1],ymm8[1],ymm5[3],ymm8[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm3[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm3[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm15[1],ymm4[3],ymm15[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm14[1],mem[1],ymm14[3],mem[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) @@ -1907,7 +1907,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rax) ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -1918,72 +1918,72 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, (%rax) ; AVX2-ONLY-NEXT: addq $808, %rsp # imm = 0x328 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride8_vf16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $200, %rsp -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm28 +; AVX512F-NEXT: subq $264, %rsp # imm = 0x108 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm18 ; AVX512F-NEXT: vmovaps 640(%rdi), %zmm0 ; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm21 -; AVX512F-NEXT: vmovaps 512(%rdi), %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm28 ; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm24 ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm31, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm31, %zmm15 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm13 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm19, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm19, %zmm15 ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %xmm16 -; AVX512F-NEXT: vinserti32x4 $1, 192(%rdi), %ymm16, %ymm19 -; AVX512F-NEXT: vinserti32x4 $1, 128(%rdi), %ymm7, %ymm18 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm31, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm10, %zmm29, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm31 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm12, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm15 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %xmm16 +; AVX512F-NEXT: vinserti32x4 $1, 192(%rdi), %ymm16, %ymm21 +; AVX512F-NEXT: vinserti32x4 $1, 128(%rdi), %ymm0, %ymm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm21[0],ymm16[2],ymm21[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm19, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm11, %zmm28, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm5 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm20 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm20[0],ymm0[0],ymm20[2],ymm0[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm20[0],ymm5[0],ymm20[2],ymm5[2] ; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm23 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 -; AVX512F-NEXT: vpermi2q %zmm10, %zmm29, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm11, %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm2 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] @@ -1991,54 +1991,54 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm27 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm15[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm12, %zmm7, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm26 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm20[1],ymm0[1],ymm20[3],ymm0[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm20[1],ymm5[1],ymm20[3],ymm5[3] ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] ; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm23, %zmm24 -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm10, %zmm29, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm22 +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm23, %zmm29 +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm30, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm11, %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm6 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm5[0],zmm13[0],zmm5[2],zmm13[2],zmm5[4],zmm13[4],zmm5[6],zmm13[6] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm24[0],zmm14[0],zmm24[2],zmm14[2],zmm24[4],zmm14[4],zmm24[6],zmm14[6] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 ; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm4, %zmm6 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] +; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm5, %zmm6 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm29[0],zmm10[0],zmm29[2],zmm10[2],zmm29[4],zmm10[4],zmm29[6],zmm10[6] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm28[0],zmm11[0],zmm28[2],zmm11[2],zmm28[4],zmm11[4],zmm28[6],zmm11[6] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm21, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm28, %zmm1, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm31, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm18, %zmm8, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm27 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,5,13,5,13,5,13] ; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -2046,163 +2046,166 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] ; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm7, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm1 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm5[1],zmm13[1],zmm5[3],zmm13[3],zmm5[5],zmm13[5],zmm5[7],zmm13[7] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm13, %zmm8, %zmm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm14[0],zmm12[0],zmm14[2],zmm12[2],zmm14[4],zmm12[4],zmm14[6],zmm12[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm14[1],zmm12[1],zmm14[3],zmm12[3],zmm14[5],zmm12[5],zmm14[7],zmm12[7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm23, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm6, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm23, %zmm2 -; AVX512F-NEXT: vpermi2q %zmm10, %zmm29, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm4, %zmm12 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm10[1],zmm29[3],zmm10[3],zmm29[5],zmm10[5],zmm29[7],zmm10[7] -; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm29 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm11[0],zmm9[0],zmm11[2],zmm9[2],zmm11[4],zmm9[4],zmm11[6],zmm9[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm29 {%k1} = zmm11[1],zmm9[1],zmm11[3],zmm9[3],zmm11[5],zmm9[5],zmm11[7],zmm9[7] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm5, %zmm30 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm24[1],zmm14[1],zmm24[3],zmm14[3],zmm24[5],zmm14[5],zmm24[7],zmm14[7] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm24 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm13[0],zmm12[0],zmm13[2],zmm12[2],zmm13[4],zmm12[4],zmm13[6],zmm12[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm13[1],zmm12[1],zmm13[3],zmm12[3],zmm13[5],zmm12[5],zmm13[7],zmm12[7] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm23, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm6, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm23, %zmm3 +; AVX512F-NEXT: vpermi2q %zmm11, %zmm28, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm5, %zmm1 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm28[1],zmm11[1],zmm28[3],zmm11[3],zmm28[5],zmm11[5],zmm28[7],zmm11[7] +; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm28 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm10[1],zmm9[1],zmm10[3],zmm9[3],zmm10[5],zmm9[5],zmm10[7],zmm9[7] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm21, %zmm10, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm28, %zmm0, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm31, %zmm10, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm18, %zmm8, %zmm7 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm14, %zmm16 +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm12 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm15 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] ; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm21, %zmm10, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm8, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm31, %zmm10, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm9, %zmm2 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [7,15,7,15] ; AVX512F-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm30, %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512F-NEXT: vmovdqa 576(%rdi), %xmm11 ; AVX512F-NEXT: vinserti128 $1, 704(%rdi), %ymm11, %ymm11 -; AVX512F-NEXT: vpermi2q %zmm28, %zmm0, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm10, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm18, %zmm0, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512F-NEXT: vmovdqa 512(%rdi), %xmm10 ; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm31, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm24 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm24, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm1, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm12, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm29 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm16[1],ymm21[1],ymm16[3],ymm21[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm29, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm14, (%rdx) ; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%r8) ; AVX512F-NEXT: vmovdqa64 %zmm27, 64(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm25, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, (%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512F-NEXT: addq $200, %rsp +; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512F-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride8_vf16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $200, %rsp -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm28 +; AVX512BW-NEXT: subq $264, %rsp # imm = 0x108 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm18 ; AVX512BW-NEXT: vmovaps 640(%rdi), %zmm0 ; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm21 -; AVX512BW-NEXT: vmovaps 512(%rdi), %zmm0 -; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm28 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm24 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm31, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm31, %zmm15 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm13 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm15 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %xmm16 -; AVX512BW-NEXT: vinserti32x4 $1, 192(%rdi), %ymm16, %ymm19 -; AVX512BW-NEXT: vinserti32x4 $1, 128(%rdi), %ymm7, %ymm18 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm31, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm29, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm31 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, 192(%rdi), %ymm16, %ymm21 +; AVX512BW-NEXT: vinserti32x4 $1, 128(%rdi), %ymm0, %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm21[0],ymm16[2],ymm21[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm28, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm5 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm20[0],ymm0[0],ymm20[2],ymm0[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm20[0],ymm5[0],ymm20[2],ymm5[2] ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm23 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm29, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm2 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] @@ -2210,54 +2213,54 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm27 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm15[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm26 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm20[1],ymm0[1],ymm20[3],ymm0[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm20[1],ymm5[1],ymm20[3],ymm5[3] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm24 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm29, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm22 +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm29 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm30, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm5[0],zmm13[0],zmm5[2],zmm13[2],zmm5[4],zmm13[4],zmm5[6],zmm13[6] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm24[0],zmm14[0],zmm24[2],zmm14[2],zmm24[4],zmm14[4],zmm24[6],zmm14[6] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm6 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] +; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm29[0],zmm10[0],zmm29[2],zmm10[2],zmm29[4],zmm10[4],zmm29[6],zmm10[6] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm28[0],zmm11[0],zmm28[2],zmm11[2],zmm28[4],zmm11[4],zmm28[6],zmm11[6] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm28, %zmm1, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm8, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm27 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,5,13,5,13,5,13] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -2265,99 +2268,102 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm1 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm5[1],zmm13[1],zmm5[3],zmm13[3],zmm5[5],zmm13[5],zmm5[7],zmm13[7] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm8, %zmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm14[0],zmm12[0],zmm14[2],zmm12[2],zmm14[4],zmm12[4],zmm14[6],zmm12[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm14[1],zmm12[1],zmm14[3],zmm12[3],zmm14[5],zmm12[5],zmm14[7],zmm12[7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm23, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm2 -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm29, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm4, %zmm12 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm10[1],zmm29[3],zmm10[3],zmm29[5],zmm10[5],zmm29[7],zmm10[7] -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm29 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm11[0],zmm9[0],zmm11[2],zmm9[2],zmm11[4],zmm9[4],zmm11[6],zmm9[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm29 {%k1} = zmm11[1],zmm9[1],zmm11[3],zmm9[3],zmm11[5],zmm9[5],zmm11[7],zmm9[7] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm30 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm24[1],zmm14[1],zmm24[3],zmm14[3],zmm24[5],zmm14[5],zmm24[7],zmm14[7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm24 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm13[0],zmm12[0],zmm13[2],zmm12[2],zmm13[4],zmm12[4],zmm13[6],zmm12[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm13[1],zmm12[1],zmm13[3],zmm12[3],zmm13[5],zmm12[5],zmm13[7],zmm12[7] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm23, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm28, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm1 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm28[1],zmm11[1],zmm28[3],zmm11[3],zmm28[5],zmm11[5],zmm28[7],zmm11[7] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm28 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm10[1],zmm9[1],zmm10[3],zmm9[3],zmm10[5],zmm9[5],zmm10[7],zmm9[7] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm28, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm10, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm8, %zmm7 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm14, %zmm16 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm15 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm10, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm10, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm2 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [7,15,7,15] ; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm11 ; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm11, %ymm11 -; AVX512BW-NEXT: vpermi2q %zmm28, %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm10 ; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm31, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm24 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm24, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm1, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm12, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm29 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm16[1],ymm21[1],ymm16[3],ymm21[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm29, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) ; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm25, (%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-NEXT: addq $200, %rsp +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <128 x i64>, ptr %in.vec, align 64 @@ -2385,17 +2391,17 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE: # %bb.0: ; SSE-NEXT: subq $1688, %rsp # imm = 0x698 ; SSE-NEXT: movaps 832(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm1 +; SSE-NEXT: movaps 320(%rdi), %xmm2 ; SSE-NEXT: movaps 256(%rdi), %xmm8 -; SSE-NEXT: movaps 960(%rdi), %xmm2 +; SSE-NEXT: movaps 960(%rdi), %xmm1 ; SSE-NEXT: movaps 896(%rdi), %xmm10 -; SSE-NEXT: movaps 448(%rdi), %xmm3 +; SSE-NEXT: movaps 448(%rdi), %xmm4 ; SSE-NEXT: movaps 384(%rdi), %xmm9 -; SSE-NEXT: movaps 576(%rdi), %xmm4 +; SSE-NEXT: movaps 576(%rdi), %xmm3 ; SSE-NEXT: movaps 512(%rdi), %xmm12 -; SSE-NEXT: movaps 64(%rdi), %xmm5 +; SSE-NEXT: movaps 64(%rdi), %xmm6 ; SSE-NEXT: movaps (%rdi), %xmm11 -; SSE-NEXT: movaps 704(%rdi), %xmm6 +; SSE-NEXT: movaps 704(%rdi), %xmm5 ; SSE-NEXT: movaps 640(%rdi), %xmm14 ; SSE-NEXT: movaps 192(%rdi), %xmm7 ; SSE-NEXT: movaps 128(%rdi), %xmm13 @@ -2405,34 +2411,34 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm11, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm6[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm3[1] +; SSE-NEXT: movaps %xmm9, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] +; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] +; SSE-NEXT: movaps %xmm12, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 768(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -2703,7 +2709,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 1696(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1888(%rdi), %xmm0 @@ -2714,13 +2720,14 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 2016(%rdi), %xmm0 -; SSE-NEXT: movaps 1952(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 112(%rdi), %xmm0 -; SSE-NEXT: movaps 48(%rdi), %xmm1 +; SSE-NEXT: movaps 1952(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm0 +; SSE-NEXT: movaps 48(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2760,7 +2767,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 880(%rdi), %xmm0 ; SSE-NEXT: movaps 816(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -2769,54 +2776,53 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1008(%rdi), %xmm0 -; SSE-NEXT: movaps 944(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps 944(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 1136(%rdi), %xmm0 -; SSE-NEXT: movaps 1072(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps 1072(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 1264(%rdi), %xmm0 -; SSE-NEXT: movaps 1200(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movaps 1200(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 1392(%rdi), %xmm0 -; SSE-NEXT: movaps 1328(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 1328(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 1520(%rdi), %xmm0 -; SSE-NEXT: movaps 1456(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movaps 1648(%rdi), %xmm0 -; SSE-NEXT: movaps 1584(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps 1456(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] -; SSE-NEXT: movaps 1776(%rdi), %xmm0 -; SSE-NEXT: movaps 1712(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm9 +; SSE-NEXT: movaps 1648(%rdi), %xmm0 +; SSE-NEXT: movaps 1584(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm9 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 1776(%rdi), %xmm0 +; SSE-NEXT: movaps 1712(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps 1904(%rdi), %xmm0 ; SSE-NEXT: movaps 1840(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps 2032(%rdi), %xmm0 -; SSE-NEXT: movaps 1968(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movaps 1968(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2949,7 +2955,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 240(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%r9) @@ -2978,7 +2984,8 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm14, 240(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3010,13 +3017,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm4, 240(%rax) -; SSE-NEXT: movaps %xmm5, 224(%rax) -; SSE-NEXT: movaps %xmm9, 208(%rax) -; SSE-NEXT: movaps %xmm11, 192(%rax) -; SSE-NEXT: movaps %xmm13, 176(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rax) +; SSE-NEXT: movaps %xmm3, 240(%rax) +; SSE-NEXT: movaps %xmm6, 224(%rax) +; SSE-NEXT: movaps %xmm8, 208(%rax) +; SSE-NEXT: movaps %xmm9, 192(%rax) +; SSE-NEXT: movaps %xmm10, 176(%rax) +; SSE-NEXT: movaps %xmm15, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3038,18 +3044,18 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 240(%rax) +; SSE-NEXT: movaps %xmm2, 240(%rax) ; SSE-NEXT: movaps %xmm1, 224(%rax) -; SSE-NEXT: movaps %xmm2, 208(%rax) -; SSE-NEXT: movaps %xmm7, 192(%rax) -; SSE-NEXT: movaps %xmm8, 176(%rax) -; SSE-NEXT: movaps %xmm6, 160(%rax) -; SSE-NEXT: movaps %xmm10, 144(%rax) -; SSE-NEXT: movaps %xmm12, 128(%rax) -; SSE-NEXT: movaps %xmm15, 112(%rax) +; SSE-NEXT: movaps %xmm4, 208(%rax) +; SSE-NEXT: movaps %xmm5, 192(%rax) +; SSE-NEXT: movaps %xmm7, 176(%rax) +; SSE-NEXT: movaps %xmm13, 160(%rax) +; SSE-NEXT: movaps %xmm14, 144(%rax) +; SSE-NEXT: movaps %xmm11, 128(%rax) +; SSE-NEXT: movaps %xmm12, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) @@ -3066,49 +3072,49 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2280, %rsp # imm = 0x8E8 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: subq $2216, %rsp # imm = 0x8A8 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm9[0],xmm7[0] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm7[0],xmm5[0] ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm9[1],xmm8[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm10[0] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm3[0] @@ -3163,136 +3169,136 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm10[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm11[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1104(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1104(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1872(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm14[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1872(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] ; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm4[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm3[0],ymm13[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm10[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm11[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm12[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm3[1],ymm8[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm3[1],ymm13[3],ymm3[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 @@ -3388,7 +3394,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 @@ -3399,354 +3405,358 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm10[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm4[0],xmm11[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm5[0],xmm12[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm6[0],xmm13[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm7[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1904(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm15[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovaps 1904(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm11[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm12[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm13[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm15[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r8) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, (%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rax) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $2280, %rsp # imm = 0x8E8 -; AVX1-ONLY-NEXT: vzeroupper -; AVX1-ONLY-NEXT: retq -; -; AVX2-ONLY-LABEL: load_i64_stride8_vf32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $2248, %rsp # imm = 0x8C8 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%r9) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rax) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: addq $2216, %rsp # imm = 0x8A8 +; AVX1-ONLY-NEXT: vzeroupper +; AVX1-ONLY-NEXT: retq +; +; AVX2-ONLY-LABEL: load_i64_stride8_vf32: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: subq $2248, %rsp # imm = 0x8C8 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] @@ -3759,7 +3769,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, 1152(%rdi), %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3775,136 +3785,136 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm13[0],ymm1[0],ymm13[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-ONLY-NEXT: vunpckhpd (%rsp), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm4[1],ymm13[3],ymm4[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 @@ -3971,133 +3981,132 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm15[1],ymm6[1],ymm15[3],ymm6[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) @@ -4215,11 +4224,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 96(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4232,189 +4240,186 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-LABEL: load_i64_stride8_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $2440, %rsp # imm = 0x988 -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm23 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm31 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm20 ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %ymm16 -; AVX512F-NEXT: vmovdqa 1152(%rdi), %ymm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm16[0],ymm12[2],ymm16[2] -; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %ymm29 +; AVX512F-NEXT: vmovdqa 1152(%rdi), %ymm13 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm13[0],ymm29[0],ymm13[2],ymm29[2] +; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm6 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm9[2,3],ymm5[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] -; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm11 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm14[2,3],ymm8[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm8 +; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm11[2,3],ymm10[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm18 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm25 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm25[0],ymm5[0],ymm25[2],ymm5[2] +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm26 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm27 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm15[2,3],ymm11[2,3] +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm10, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm8 -; AVX512F-NEXT: vpermi2q %zmm10, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa 1664(%rdi), %ymm8 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm20 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %ymm26 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm26[0],ymm20[0],ymm26[2],ymm20[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm14, %zmm23, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm10 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %ymm28 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm28[0],ymm10[0],ymm28[2],ymm10[2] +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm31 +; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm2[0],ymm31[0],ymm2[2],ymm31[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm16[1],ymm12[3],ymm16[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm29[1],ymm13[3],ymm29[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm5[1],ymm25[3],ymm5[3] +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm8 +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm9 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm9 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm11 -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3] -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm14 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm10, %zmm30, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm1[1],ymm8[3],ymm1[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm26[1],ymm20[1],ymm26[3],ymm20[3] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm11, %zmm23, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm28[1],ymm10[1],ymm28[3],ymm10[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm31[1],ymm2[3],ymm31[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm25[0],zmm24[0],zmm25[2],zmm24[2],zmm25[4],zmm24[4],zmm25[6],zmm24[6] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm13[0],zmm16[0],zmm13[2],zmm16[2],zmm13[4],zmm16[4],zmm13[6],zmm16[6] -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm13 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm29[0],zmm6[2],zmm29[2],zmm6[4],zmm29[4],zmm6[6],zmm29[6] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm6 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm22 ; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm31[0],zmm17[0],zmm31[2],zmm17[2],zmm31[4],zmm17[4],zmm31[6],zmm17[6] -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm20 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm14[0],zmm30[2],zmm14[2],zmm30[4],zmm14[4],zmm30[6],zmm14[6] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 @@ -4428,507 +4433,508 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm10[0],zmm30[2],zmm10[2],zmm30[4],zmm10[4],zmm30[6],zmm10[6] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm11[0],zmm7[2],zmm11[2],zmm7[4],zmm11[4],zmm7[6],zmm11[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm9, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm9, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm30, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] ; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm31, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm25[1],zmm24[1],zmm25[3],zmm24[3],zmm25[5],zmm24[5],zmm25[7],zmm24[7] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm24, %zmm9, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm28[0],zmm22[0],zmm28[2],zmm22[2],zmm28[4],zmm22[4],zmm28[6],zmm22[6] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm21, %zmm11, %zmm18 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6] ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm28[1],zmm22[1],zmm28[3],zmm22[3],zmm28[5],zmm22[5],zmm28[7],zmm22[7] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm25[1],zmm17[1],zmm25[3],zmm17[3],zmm25[5],zmm17[5],zmm25[7],zmm17[7] +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 ; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm30, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm24, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm17, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm13 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,5,13] -; AVX512F-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm14 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm30, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm12, %zmm15 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm19 = zmm1[1],zmm19[1],zmm1[3],zmm19[3],zmm1[5],zmm19[5],zmm1[7],zmm19[7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm23[0],zmm17[0],zmm23[2],zmm17[2],zmm23[4],zmm17[4],zmm23[6],zmm17[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm17[1],zmm23[3],zmm17[3],zmm23[5],zmm17[5],zmm23[7],zmm17[7] -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm30, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm9, %zmm5, %zmm3 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm31, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm24, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm4, %zmm16 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm2[1],zmm29[1],zmm2[3],zmm29[3],zmm2[5],zmm29[5],zmm2[7],zmm29[7] +; AVX512F-NEXT: vpermt2q %zmm29, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k1} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm0[1],zmm26[3],zmm0[3],zmm26[5],zmm0[5],zmm26[7],zmm0[7] +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm21 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm11, %zmm8, %zmm14 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %ymm1, %ymm27 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm31, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm24, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm30 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm31, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm29 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm6[1],zmm1[1],zmm6[3],zmm1[3],zmm6[5],zmm1[5],zmm6[7],zmm1[7] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm30, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm30, %zmm24 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k1} = zmm22[0],zmm1[0],zmm22[2],zmm1[2],zmm22[4],zmm1[4],zmm22[6],zmm1[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm22[1],zmm1[1],zmm22[3],zmm1[3],zmm22[5],zmm1[5],zmm22[7],zmm1[7] +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm31, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm24, %zmm28 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm3[1],zmm2[1],zmm3[3],zmm2[3],zmm3[5],zmm2[5],zmm3[7],zmm2[7] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm1[0],zmm7[2],zmm1[2],zmm7[4],zmm1[4],zmm7[6],zmm1[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm7[1],zmm1[1],zmm7[3],zmm1[3],zmm7[5],zmm1[5],zmm7[7],zmm1[7] +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm31 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm17 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm7[0],zmm5[0],zmm7[2],zmm5[2],zmm7[4],zmm5[4],zmm7[6],zmm5[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm7[1],zmm5[1],zmm7[3],zmm5[3],zmm7[5],zmm5[5],zmm7[7],zmm5[7] -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm9, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm18, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm5 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm11 -; AVX512F-NEXT: vpermi2q %zmm18, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm19 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm0 # 32-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm8, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm15 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm18, %zmm14, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm9, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm15, %zmm6, %zmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] ; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm8 -; AVX512F-NEXT: vpermi2q %zmm18, %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm4, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm5, %ymm5 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm7, %ymm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa 576(%rdi), %xmm10 -; AVX512F-NEXT: vinserti128 $1, 704(%rdi), %ymm10, %ymm10 -; AVX512F-NEXT: vmovdqa 512(%rdi), %xmm11 -; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm11, %ymm11 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm16, %zmm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512F-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %xmm15 -; AVX512F-NEXT: vinserti128 $1, 1152(%rdi), %ymm15, %ymm15 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm20, %zmm14, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %xmm22 -; AVX512F-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm22, %ymm22 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %xmm24 -; AVX512F-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm24, %ymm24 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[2],ymm22[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm26, %zmm30, %zmm16 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm2, %zmm5 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] +; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm15, %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm21 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm4, %ymm4 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm5, %ymm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa 576(%rdi), %xmm8 +; AVX512F-NEXT: vinserti128 $1, 704(%rdi), %ymm8, %ymm8 +; AVX512F-NEXT: vmovdqa 512(%rdi), %xmm10 +; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm18, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm12 +; AVX512F-NEXT: vinserti128 $1, 1216(%rdi), %ymm12, %ymm12 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %xmm16 +; AVX512F-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm22, %zmm13, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %xmm25 +; AVX512F-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm25, %ymm25 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %xmm26 +; AVX512F-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm26, %ymm26 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm26[0],ymm25[0],ymm26[2],ymm25[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm31, %zmm27 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm25 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm25, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm11 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm24[1],ymm22[1],ymm24[3],ymm22[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm20 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm17 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm25[1],ymm26[3],ymm25[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm1 ; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm11 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm11 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm14 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm14, %zmm1, %zmm14 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm15 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm16, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm20, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, (%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, (%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm10, %zmm13, %zmm10 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm13 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm13 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm14 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm3, %zmm14 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm15 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm3, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm27, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm22, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 192(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, (%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 64(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 128(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 192(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, (%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 64(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 128(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, 192(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, (%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 64(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 128(%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm27, 192(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm30, 192(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, (%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, 64(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, 128(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm21, 128(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rax) -; AVX512F-NEXT: vmovaps %zmm14, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512F-NEXT: addq $2440, %rsp # imm = 0x988 +; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm13, (%rax) +; AVX512F-NEXT: vmovaps %zmm10, 64(%rax) +; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride8_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2440, %rsp # imm = 0x988 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm20 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm16 -; AVX512BW-NEXT: vmovdqa 1152(%rdi), %ymm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm16[0],ymm12[2],ymm16[2] -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm5 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm29 +; AVX512BW-NEXT: vmovdqa 1152(%rdi), %ymm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm13[0],ymm29[0],ymm13[2],ymm29[2] +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm6 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm9[2,3],ymm5[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm2 -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] -; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm14[2,3],ymm8[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm8 +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm11[2,3],ymm10[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm18 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm25 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm25[0],ymm5[0],ymm25[2],ymm5[2] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm26 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm27 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm15[2,3],ymm11[2,3] +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm20 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %ymm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm26[0],ymm20[0],ymm26[2],ymm20[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm10 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %ymm28 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm28[0],ymm10[0],ymm28[2],ymm10[2] +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm31 +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm2[0],ymm31[0],ymm2[2],ymm31[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm16[1],ymm12[3],ymm16[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm29[1],ymm13[3],ymm29[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm5[1],ymm25[3],ymm5[3] +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm8 +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm9 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm9 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm11 -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3] -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm14 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm1[1],ymm8[3],ymm1[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm26[1],ymm20[1],ymm26[3],ymm20[3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm28[1],ymm10[1],ymm28[3],ymm10[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm31[1],ymm2[3],ymm31[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm25[0],zmm24[0],zmm25[2],zmm24[2],zmm25[4],zmm24[4],zmm25[6],zmm24[6] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm13[0],zmm16[0],zmm13[2],zmm16[2],zmm13[4],zmm16[4],zmm13[6],zmm16[6] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm29[0],zmm6[2],zmm29[2],zmm6[4],zmm29[4],zmm6[6],zmm29[6] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm22 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm31[0],zmm17[0],zmm31[2],zmm17[2],zmm31[4],zmm17[4],zmm31[6],zmm17[6] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm20 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm14[0],zmm30[2],zmm14[2],zmm30[4],zmm14[4],zmm30[6],zmm14[6] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 @@ -4942,319 +4948,323 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm10[0],zmm30[2],zmm10[2],zmm30[4],zmm10[4],zmm30[6],zmm10[6] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm11[0],zmm7[2],zmm11[2],zmm7[4],zmm11[4],zmm7[6],zmm11[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm25[1],zmm24[1],zmm25[3],zmm24[3],zmm25[5],zmm24[5],zmm25[7],zmm24[7] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm9, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm28[0],zmm22[0],zmm28[2],zmm22[2],zmm28[4],zmm22[4],zmm28[6],zmm22[6] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm11, %zmm18 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6] ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm28[1],zmm22[1],zmm28[3],zmm22[3],zmm28[5],zmm22[5],zmm28[7],zmm22[7] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm25[1],zmm17[1],zmm25[3],zmm17[3],zmm25[5],zmm17[5],zmm25[7],zmm17[7] +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 ; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm13 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,5,13] -; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm3 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm30, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm12, %zmm15 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 = zmm1[1],zmm19[1],zmm1[3],zmm19[3],zmm1[5],zmm19[5],zmm1[7],zmm19[7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm23[0],zmm17[0],zmm23[2],zmm17[2],zmm23[4],zmm17[4],zmm23[6],zmm17[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm17[1],zmm23[3],zmm17[3],zmm23[5],zmm17[5],zmm23[7],zmm17[7] -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm31, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm24, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm16 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm2[1],zmm29[1],zmm2[3],zmm29[3],zmm2[5],zmm29[5],zmm2[7],zmm29[7] +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k1} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm0[1],zmm26[3],zmm0[3],zmm26[5],zmm0[5],zmm26[7],zmm0[7] +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %ymm1, %ymm27 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm24, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm30 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k1} = zmm22[0],zmm1[0],zmm22[2],zmm1[2],zmm22[4],zmm1[4],zmm22[6],zmm1[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm22[1],zmm1[1],zmm22[3],zmm1[3],zmm22[5],zmm1[5],zmm22[7],zmm1[7] +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm31 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm30, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm27 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm8, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm29 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm6[1],zmm1[1],zmm6[3],zmm1[3],zmm6[5],zmm1[5],zmm6[7],zmm1[7] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm30, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm28 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm3[1],zmm2[1],zmm3[3],zmm2[3],zmm3[5],zmm2[5],zmm3[7],zmm2[7] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm1[0],zmm7[2],zmm1[2],zmm7[4],zmm1[4],zmm7[6],zmm1[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm7[1],zmm1[1],zmm7[3],zmm1[3],zmm7[5],zmm1[5],zmm7[7],zmm1[7] +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm31 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm17 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm7[0],zmm5[0],zmm7[2],zmm5[2],zmm7[4],zmm5[4],zmm7[6],zmm5[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm7[1],zmm5[1],zmm7[3],zmm5[3],zmm7[5],zmm5[5],zmm7[7],zmm5[7] -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm9, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm0, %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm19 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm0 # 32-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm15 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm14, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm9, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm15, %zmm6, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] ; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm5, %ymm5 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm7, %ymm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm10 -; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm10, %ymm10 -; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm11 -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm11, %ymm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm16, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %xmm15 -; AVX512BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm15, %ymm15 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm14, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %xmm22 -; AVX512BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm22, %ymm22 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm24 -; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm24, %ymm24 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[2],ymm22[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm26, %zmm30, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm5 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] +; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm15, %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm21 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm4, %ymm4 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm5, %ymm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm8 +; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm8, %ymm8 +; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm10 +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm18, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm12 +; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm12, %ymm12 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm22, %zmm13, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %xmm25 +; AVX512BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm25, %ymm25 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm26 +; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm26, %ymm26 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm26[0],ymm25[0],ymm26[2],ymm25[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm31, %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm25, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm11 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm24[1],ymm22[1],ymm24[3],ymm22[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm25[1],ymm26[3],ymm25[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm1 ; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm11 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm14 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm14, %zmm1, %zmm14 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm15 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm15 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, (%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm10, %zmm13, %zmm10 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm13 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm13 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm14 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm3, %zmm14 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm15 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm3, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, (%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 64(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 128(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 192(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, (%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 64(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 128(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm3, 192(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, (%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 64(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 128(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 192(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm3, (%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm3, 64(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm3, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rax) -; AVX512BW-NEXT: vmovaps %zmm14, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512BW-NEXT: addq $2440, %rsp # imm = 0x988 +; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rax) +; AVX512BW-NEXT: vmovaps %zmm10, 64(%rax) +; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <256 x i64>, ptr %in.vec, align 64 @@ -6107,29 +6117,29 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 2928(%rdi), %xmm0 -; SSE-NEXT: movaps 2864(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps 2864(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 3056(%rdi), %xmm0 -; SSE-NEXT: movaps 2992(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps 2992(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 3184(%rdi), %xmm0 -; SSE-NEXT: movaps 3120(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movaps 3120(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 3312(%rdi), %xmm0 -; SSE-NEXT: movaps 3248(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movaps 3248(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 3440(%rdi), %xmm0 ; SSE-NEXT: movaps 3376(%rdi), %xmm12 ; SSE-NEXT: movaps %xmm12, %xmm1 @@ -6137,19 +6147,19 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 3568(%rdi), %xmm0 -; SSE-NEXT: movaps 3504(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 4016(%rdi), %xmm2 -; SSE-NEXT: movaps 3952(%rdi), %xmm4 +; SSE-NEXT: movaps 3504(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 4016(%rdi), %xmm4 +; SSE-NEXT: movaps 3952(%rdi), %xmm3 ; SSE-NEXT: movaps 3696(%rdi), %xmm0 -; SSE-NEXT: movaps 3632(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps 3632(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: movaps 4080(%rdi), %xmm1 ; SSE-NEXT: movaps 3888(%rdi), %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps 3824(%rdi), %xmm6 ; SSE-NEXT: movaps 3760(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm7, 496(%rsi) @@ -6159,11 +6169,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movaps %xmm6, 480(%rsi) ; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 464(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6546,11 +6556,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm4, 496(%rax) +; SSE-NEXT: movaps %xmm3, 496(%rax) ; SSE-NEXT: movaps %xmm6, 480(%rax) ; SSE-NEXT: movaps %xmm7, 464(%rax) -; SSE-NEXT: movaps %xmm11, 448(%rax) -; SSE-NEXT: movaps %xmm14, 432(%rax) +; SSE-NEXT: movaps %xmm10, 448(%rax) +; SSE-NEXT: movaps %xmm13, 432(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 416(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6606,16 +6616,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 496(%rax) +; SSE-NEXT: movaps %xmm4, 496(%rax) ; SSE-NEXT: movaps %xmm5, 480(%rax) ; SSE-NEXT: movaps %xmm0, 464(%rax) -; SSE-NEXT: movaps %xmm3, 448(%rax) -; SSE-NEXT: movaps %xmm10, 432(%rax) +; SSE-NEXT: movaps %xmm2, 448(%rax) +; SSE-NEXT: movaps %xmm8, 432(%rax) ; SSE-NEXT: movaps %xmm12, 416(%rax) -; SSE-NEXT: movaps %xmm8, 400(%rax) -; SSE-NEXT: movaps %xmm9, 384(%rax) -; SSE-NEXT: movaps %xmm13, 368(%rax) -; SSE-NEXT: movaps %xmm15, 352(%rax) +; SSE-NEXT: movaps %xmm9, 400(%rax) +; SSE-NEXT: movaps %xmm15, 384(%rax) +; SSE-NEXT: movaps %xmm11, 368(%rax) +; SSE-NEXT: movaps %xmm14, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 336(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -6665,7 +6675,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride8_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $4984, %rsp # imm = 0x1378 +; AVX1-ONLY-NEXT: subq $5016, %rsp # imm = 0x1398 ; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -6858,236 +6868,232 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1872(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1872(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2384(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2320(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 2384(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 2320(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2896(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2832(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 2896(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 2832(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3520(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3520(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3408(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3344(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 3408(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 3344(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 4032(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3968(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 4032(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3920(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3856(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 3968(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 3920(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 3856(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm7[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm4[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm8[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1104(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm8[0],xmm9[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1104(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm11[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 2128(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2064(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 2128(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 2064(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm12[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2752(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2752(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 2640(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2576(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 2640(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 2576(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm13[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3200(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 3152(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3088(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 3200(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 3152(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 3088(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm6[0],xmm14[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3776(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3776(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 3712(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 3664(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3600(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm7[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 3664(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 3600(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm15[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm10[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm12[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm11[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7101,9 +7107,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm12[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7117,9 +7121,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm13[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7133,9 +7135,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm14[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7149,9 +7149,15 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm15[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 @@ -7207,7 +7213,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -7231,7 +7237,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -7346,655 +7352,659 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm6[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1904(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2272(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2208(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2160(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2096(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2528(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1904(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2272(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2352(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm9[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2208(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2784(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 2160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2096(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2720(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2528(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2672(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2608(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm10[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 2416(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2976(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2784(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2864(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm11[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2720(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3296(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 2672(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2608(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3232(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3184(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3120(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm12[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2976(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3552(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2864(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3296(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3440(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3376(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 3232(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3808(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 3184(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 3120(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3744(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3696(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3632(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 3552(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovaps 3440(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 3376(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 4064(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3808(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 3744(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps 3696(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 3632(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 4000(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3952(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3888(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 4064(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 4000(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovaps 3952(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 3888(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm3[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm4[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm5[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm6[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm7[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm8[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm9[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm10[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm11[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm12[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm13[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm14[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 464(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 448(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 256(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 384(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 320(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 272(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 400(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 336(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 208(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 144(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 496(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 480(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 416(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 352(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 288(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 432(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 368(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 304(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 240(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 176(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 144(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 256(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 272(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 208(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 320(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 336(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 384(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 400(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 448(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 464(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 176(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 240(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 288(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 304(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 352(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 368(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 416(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 432(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 480(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 496(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 448(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 320(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 256(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 480(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 416(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 352(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 288(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 480(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 448(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 416(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 352(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 320(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 288(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 256(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%r8) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 496(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 480(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 464(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 448(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 432(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 416(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 400(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 384(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 368(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 352(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 336(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 320(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 304(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 288(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 272(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 256(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 240(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 224(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 208(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 192(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 144(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 112(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 80(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, (%r9) +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 464(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 448(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 256(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 384(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 320(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 272(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 400(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 336(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 480(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 416(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 352(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 288(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 432(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 368(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 304(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 272(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 336(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 400(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 464(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 288(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 304(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 352(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 368(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 432(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 480(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 464(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 448(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 432(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 416(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 400(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 384(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 368(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 352(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 336(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 320(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 304(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 288(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 272(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 256(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 496(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 480(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 464(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 448(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 432(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 416(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 400(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 384(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 368(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 336(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 304(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 288(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 272(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 256(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 240(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 176(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 144(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 112(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 80(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, (%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 464(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 432(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 400(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 368(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 336(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 304(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 272(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rax) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 480(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 448(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 416(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 352(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 320(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 288(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 256(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 416(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 384(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 288(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 352(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8011,7 +8021,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $4984, %rsp # imm = 0x1378 +; AVX1-ONLY-NEXT: addq $5016, %rsp # imm = 0x1398 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -8148,259 +8158,258 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2368(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2304(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2368(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2432(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2304(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2432(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2816(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2944(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2816(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2944(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3392(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3328(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 3392(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3520(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3456(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 3328(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3904(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 3520(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3840(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 4032(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 3456(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3968(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 3904(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 3840(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 4032(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 3968(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm15[0],ymm10[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2752(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2752(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 2688(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3136(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 3072(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 3136(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 3072(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps 3264(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 3200(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3648(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 3584(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 3648(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 3584(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 3776(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 3712(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm12[0],ymm6[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] @@ -8415,7 +8424,8 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8427,7 +8437,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm12[1],ymm6[3],ymm12[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8568,180 +8578,180 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2144(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2080(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2144(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2272(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2208(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2080(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 2272(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2208(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2400(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2336(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2400(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2528(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2336(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 2528(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2656(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2592(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2656(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2784(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2720(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2592(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 2784(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2720(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2912(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2848(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2912(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3040(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2976(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2848(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3168(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 3040(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3104(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2976(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 3168(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3296(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 3232(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 3104(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 3296(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 3232(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3424(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 3360(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 3552(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 3488(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 3424(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 3360(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 3552(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 3488(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3680(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 3616(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 3808(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 3744(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm6[0],ymm13[2],ymm6[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 3680(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 3616(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 3808(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 3744(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 3936(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 3872(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 3872(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 4064(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 4000(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm12[0],ymm4[0],ymm12[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8753,10 +8763,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8839,19 +8849,19 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm14[1],ymm10[1],ymm14[3],ymm10[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm6[1],ymm13[3],ymm6[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm4[1],ymm12[3],ymm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) @@ -9081,9 +9091,9 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 448(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 384(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 448(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 416(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 384(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm15, 352(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rax) @@ -9114,98 +9124,105 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-LABEL: load_i64_stride8_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $6600, %rsp # imm = 0x19C8 -; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15 ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm6 ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm29 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 3264(%rdi), %ymm30 +; AVX512F-NEXT: vmovdqa 3264(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm30[0],ymm0[2],ymm30[2] -; AVX512F-NEXT: vmovdqa64 3136(%rdi), %ymm31 -; AVX512F-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512F-NEXT: vmovdqa 3136(%rdi), %ymm3 ; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm31[0],ymm3[2],ymm31[2] +; AVX512F-NEXT: vmovdqa 3072(%rdi), %ymm14 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm14[0],ymm3[0],ymm14[2],ymm3[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm3 ; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm26 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm3[0],ymm26[2],ymm3[2] +; AVX512F-NEXT: vmovdqa64 576(%rdi), %ymm26 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm23 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm26[0],ymm23[2],ymm26[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm23 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm23[0],ymm19[2],ymm23[2] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm18 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm18[0],ymm12[2],ymm18[2] +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm30 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm20[0],ymm16[2],ymm20[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %ymm25 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm25[0],ymm16[2],ymm25[2] +; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqa 1664(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm21 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %ymm20 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm20[0],ymm21[0],ymm20[2],ymm21[2] +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %ymm17 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm21[0],ymm17[2],ymm21[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9216,17 +9233,18 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %ymm29 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %ymm27 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %ymm28 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %ymm25 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm25[0],ymm0[0],ymm25[2],ymm0[2] +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %ymm24 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %ymm22 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm24[0],ymm22[2],ymm24[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9241,13 +9259,14 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqa 2688(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa 2688(%rdi), %ymm11 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX512F-NEXT: vmovdqa 2624(%rdi), %ymm10 -; AVX512F-NEXT: vmovdqa 2560(%rdi), %ymm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512F-NEXT: vmovdqa64 2624(%rdi), %ymm31 +; AVX512F-NEXT: vmovdqa 2560(%rdi), %ymm10 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm10[0],ymm31[0],ymm10[2],ymm31[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9256,20 +9275,18 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm18 ; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 2240(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa 2176(%rdi), %ymm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] -; AVX512F-NEXT: vmovdqa 2112(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %ymm22 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm3[0],ymm22[2],ymm3[2] +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %ymm28 +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %ymm19 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm28[0],ymm19[2],ymm28[2] +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %ymm27 +; AVX512F-NEXT: vmovdqa 2048(%rdi), %ymm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm27[0],ymm6[2],ymm27[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9278,148 +9295,146 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 3968(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 3904(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3904(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 3840(%rdi), %zmm3 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 3776(%rdi), %ymm24 -; AVX512F-NEXT: vmovdqa 3712(%rdi), %ymm8 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm24[0],ymm8[2],ymm24[2] -; AVX512F-NEXT: vmovdqa 3648(%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 3776(%rdi), %ymm12 +; AVX512F-NEXT: vmovdqa 3712(%rdi), %ymm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] +; AVX512F-NEXT: vmovdqa 3648(%rdi), %ymm4 +; AVX512F-NEXT: vmovdqa 3584(%rdi), %ymm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm30[1],ymm13[3],ymm30[3] -; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm13[1],ymm31[1],ymm13[3],ymm31[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm14 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm26, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = ymm26[1],mem[1],ymm26[3],mem[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm26[1],ymm23[3],ymm26[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm14 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm19[1],ymm23[1],ymm19[3],ymm23[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm12[1],ymm18[1],ymm12[3],ymm18[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm16[1],ymm25[1],ymm16[3],ymm25[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm20[1],ymm21[1],ymm20[3],ymm21[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm25, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm20[1],ymm16[3],ymm20[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm13 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm13 = ymm29[1],mem[1],ymm29[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm28[1],ymm27[1],ymm28[3],ymm27[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm27, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm9 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm21[1],ymm17[3],ymm21[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm17, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm22, %ymm5 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm5 = ymm22[1],mem[1],ymm22[3],mem[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm25, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm24[1],ymm22[3],ymm24[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm9, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm8[1],ymm24[1],ymm8[3],ymm24[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm25, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm31[1],ymm10[3],ymm31[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm19[1],ymm28[1],ymm19[3],ymm28[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm27[1],ymm6[3],ymm27[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm5, %zmm19, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm9[1],ymm12[1],ymm9[3],ymm12[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm24[0],zmm4[0],zmm24[2],zmm4[2],zmm24[4],zmm4[4],zmm24[6],zmm4[6] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm27[0],zmm12[0],zmm27[2],zmm12[2],zmm27[4],zmm12[4],zmm27[6],zmm12[6] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm4 @@ -9430,15 +9445,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 = zmm31[0],mem[0],zmm31[2],mem[2],zmm31[4],mem[4],zmm31[6],mem[6] -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 @@ -9448,72 +9465,72 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm26[0],zmm6[2],zmm26[2],zmm6[4],zmm26[4],zmm6[6],zmm26[6] +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm29[0],zmm26[0],zmm29[2],zmm26[2],zmm29[4],zmm26[4],zmm29[6],zmm26[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm18[0],zmm12[2],zmm18[2],zmm12[4],zmm18[4],zmm12[6],zmm18[6] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm30 ; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm23 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm20 ; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm20[0],zmm16[2],zmm20[2],zmm16[4],zmm20[4],zmm16[6],zmm20[6] +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm16 ; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm27[0],zmm8[2],zmm27[2],zmm8[4],zmm27[4],zmm8[6],zmm27[6] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm15[0],zmm24[2],zmm15[2],zmm24[4],zmm15[4],zmm24[6],zmm15[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm28 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm14 ; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9528,16 +9545,14 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm13[0],zmm11[0],zmm13[2],zmm11[2],zmm13[4],zmm11[4],zmm13[6],zmm11[6] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm4[0],zmm8[0],zmm4[2],zmm8[2],zmm4[4],zmm8[4],zmm4[6],zmm8[6] +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 ; AVX512F-NEXT: vmovdqa64 3648(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 3584(%rdi), %zmm3 @@ -9549,9 +9564,9 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm1[0],zmm9[0],zmm1[2],zmm9[2],zmm1[4],zmm9[4],zmm1[6],zmm9[6] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm18 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] @@ -9561,118 +9576,117 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm24[1],zmm17[1],zmm24[3],zmm17[3],zmm24[5],zmm17[5],zmm24[7],zmm17[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm12[1],zmm27[3],zmm12[3],zmm27[5],zmm12[5],zmm27[7],zmm12[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm22[1],zmm25[3],zmm22[3],zmm25[5],zmm22[5],zmm25[7],zmm22[7] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm13[1],zmm31[3],zmm13[3],zmm31[5],zmm13[5],zmm31[7],zmm13[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm4 = zmm29[1],mem[1],zmm29[3],mem[3],zmm29[5],mem[5],zmm29[7],mem[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm19[1],mem[1],zmm19[3],mem[3],zmm19[5],mem[5],zmm19[7],mem[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm29 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm24[1],mem[1],zmm24[3],mem[3],zmm24[5],mem[5],zmm24[7],mem[7] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm9[1],zmm11[1],zmm9[3],zmm11[3],zmm9[5],zmm11[5],zmm9[7],zmm11[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm8[1],mem[1],zmm8[3],mem[3],zmm8[5],mem[5],zmm8[7],mem[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm8[1],zmm7[1],zmm8[3],zmm7[3],zmm8[5],zmm7[5],zmm8[7],zmm7[7] +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm8[1],zmm4[3],zmm8[3],zmm4[5],zmm8[5],zmm4[7],zmm8[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm1 = zmm1[1],mem[1],zmm1[3],mem[3],zmm1[5],mem[5],zmm1[7],mem[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm25[1],zmm18[1],zmm25[3],zmm18[3],zmm25[5],zmm18[5],zmm25[7],zmm18[7] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9683,236 +9697,241 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 {%k1} = zmm17[0],mem[0],zmm17[2],mem[2],zmm17[4],mem[4],zmm17[6],mem[6] +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512F-NEXT: # zmm3 {%k1} = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm18[0],zmm27[2],zmm18[2],zmm27[4],zmm18[4],zmm27[6],zmm18[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm31 {%k1} # 64-byte Folded Reload +; AVX512F-NEXT: # zmm31 {%k1} = zmm24[0],mem[0],zmm24[2],mem[2],zmm24[4],mem[4],zmm24[6],mem[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm14[0],zmm28[0],zmm14[2],zmm28[2],zmm14[4],zmm28[4],zmm14[6],zmm28[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm31[0],zmm27[2],zmm31[2],zmm27[4],zmm31[4],zmm27[6],zmm31[6] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm15 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm31[0],zmm16[0],zmm31[2],zmm16[2],zmm31[4],zmm16[4],zmm31[6],zmm16[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm11 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm29[0],zmm28[0],zmm29[2],zmm28[2],zmm29[4],zmm28[4],zmm29[6],zmm28[6] +; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512F-NEXT: # zmm3 {%k1} = zmm12[0],mem[0],zmm12[2],mem[2],zmm12[4],mem[4],zmm12[6],mem[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm14[0],zmm21[0],zmm14[2],zmm21[2],zmm14[4],zmm21[4],zmm14[6],zmm21[6] +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm21[0],zmm26[2],zmm21[2],zmm26[4],zmm21[4],zmm26[6],zmm21[6] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm8 ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 ; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm6[0],zmm11[0],zmm6[2],zmm11[2],zmm6[4],zmm11[4],zmm6[6],zmm11[6] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm9[0],zmm5[0],zmm9[2],zmm5[2],zmm9[4],zmm5[4],zmm9[6],zmm5[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm25[1],zmm2[1],zmm25[3],zmm2[3],zmm25[5],zmm2[5],zmm25[7],zmm2[7] -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm1, %zmm24 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm3[1],zmm24[3],zmm3[3],zmm24[5],zmm3[5],zmm24[7],zmm3[7] +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm24 ; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm5, %zmm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm1, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm14, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm28, %zmm23, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm22, %zmm14, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm23, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm29 ; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm19 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm31[1],zmm27[3],zmm31[3],zmm27[5],zmm31[5],zmm27[7],zmm31[7] -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm19 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm30[1],zmm27[1],zmm30[3],zmm27[3],zmm30[5],zmm27[5],zmm30[7],zmm27[7] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm14, %zmm30 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm27, %zmm23, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm14, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm31[1],zmm16[1],zmm31[3],zmm16[3],zmm31[5],zmm16[5],zmm31[7],zmm16[7] +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm1, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm5, %zmm12 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm29[1],zmm28[1],zmm29[3],zmm28[3],zmm29[5],zmm28[5],zmm29[7],zmm28[7] -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm29 -; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm28, %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm8, %zmm31 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm14[1],zmm21[1],zmm14[3],zmm21[3],zmm14[5],zmm21[5],zmm14[7],zmm21[7] +; AVX512F-NEXT: vpermt2q %zmm13, %zmm14, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm23, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm8[1],zmm26[3],zmm8[3],zmm26[5],zmm8[5],zmm26[7],zmm8[7] ; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm14, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm23, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm23, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm21 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm30, %zmm1, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm16 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm27, %zmm14, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm6, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm8, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm8, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm6, %zmm9, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm9 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm18[1],zmm11[1],zmm18[3],zmm11[3],zmm18[5],zmm11[5],zmm18[7],zmm11[7] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm5, %zmm27 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm2[1],zmm4[1],zmm2[3],zmm4[3],zmm2[5],zmm4[5],zmm2[7],zmm4[7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm18, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm23, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm23, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm18, %zmm0, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm0 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm9[1],zmm5[1],zmm9[3],zmm5[3],zmm9[5],zmm5[5],zmm9[7],zmm5[7] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm4, %zmm30 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm6[1],zmm3[1],zmm6[3],zmm3[3],zmm6[5],zmm3[5],zmm6[7],zmm3[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm8 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm11 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm12 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm13 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm15 # 64-byte Folded Reload ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] ; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -9923,165 +9942,160 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm14 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm18 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%rdi), %xmm20 -; AVX512F-NEXT: vinserti32x4 $1, 192(%rdi), %ymm20, %ymm20 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm0[0],ymm20[0],ymm0[2],ymm20[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 576(%rdi), %xmm25 -; AVX512F-NEXT: vinserti32x4 $1, 704(%rdi), %ymm25, %ymm25 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %xmm30 -; AVX512F-NEXT: vinserti32x4 $1, 640(%rdi), %ymm30, %ymm30 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm30[0],ymm25[0],ymm30[2],ymm25[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %xmm16 -; AVX512F-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm16, %ymm16 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %xmm27 -; AVX512F-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm27, %ymm27 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm27[0],ymm16[0],ymm27[2],ymm16[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm17, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa 1600(%rdi), %xmm9 -; AVX512F-NEXT: vinserti128 $1, 1728(%rdi), %ymm9, %ymm9 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %xmm19 -; AVX512F-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm19, %ymm19 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm19[0],ymm9[0],ymm19[2],ymm9[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm24, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa 2112(%rdi), %xmm5 -; AVX512F-NEXT: vinserti128 $1, 2240(%rdi), %ymm5, %ymm5 -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %xmm24 -; AVX512F-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm24, %ymm24 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm24[0],ymm5[0],ymm24[2],ymm5[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqa 2624(%rdi), %xmm13 -; AVX512F-NEXT: vinserti128 $1, 2752(%rdi), %ymm13, %ymm13 -; AVX512F-NEXT: vmovdqa64 2560(%rdi), %xmm28 -; AVX512F-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm28, %ymm28 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa 3136(%rdi), %xmm7 -; AVX512F-NEXT: vinserti128 $1, 3264(%rdi), %ymm7, %ymm7 -; AVX512F-NEXT: vmovdqa64 3072(%rdi), %xmm26 -; AVX512F-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm26, %ymm26 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm26[0],ymm7[0],ymm26[2],ymm7[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm17, %zmm21, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm15 +; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm15, %ymm15 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 3648(%rdi), %xmm10 -; AVX512F-NEXT: vinserti128 $1, 3776(%rdi), %ymm10, %ymm10 -; AVX512F-NEXT: vmovdqa64 3584(%rdi), %xmm17 -; AVX512F-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm17, %ymm17 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm17[0],ymm10[0],ymm17[2],ymm10[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm26[1],ymm7[1],ymm26[3],ymm7[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm30[1],ymm25[1],ymm30[3],ymm25[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm2, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm20[1],ymm0[3],ymm20[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm9[1],ymm19[3],ymm9[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm2, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm27[1],ymm16[1],ymm27[3],ymm16[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm31, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm24[1],ymm5[1],ymm24[3],ymm5[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm17[1],ymm10[1],ymm17[3],ymm10[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %xmm24 +; AVX512F-NEXT: vinserti32x4 $1, 704(%rdi), %ymm24, %ymm30 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %xmm24 +; AVX512F-NEXT: vinserti32x4 $1, 640(%rdi), %ymm24, %ymm24 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm24[0],ymm30[0],ymm24[2],ymm30[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm9 +; AVX512F-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm9, %ymm27 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %xmm9 +; AVX512F-NEXT: vinserti128 $1, 1152(%rdi), %ymm9, %ymm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm27[0],ymm9[2],ymm27[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqa 1600(%rdi), %xmm4 +; AVX512F-NEXT: vinserti128 $1, 1728(%rdi), %ymm4, %ymm4 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %xmm17 +; AVX512F-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm17, %ymm17 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm17[0],ymm4[0],ymm17[2],ymm4[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm19, %zmm29, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %xmm19 +; AVX512F-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm19, %ymm19 +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %xmm26 +; AVX512F-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm26[0],ymm19[0],ymm26[2],ymm19[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm29, %zmm20, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa 2624(%rdi), %xmm12 +; AVX512F-NEXT: vinserti128 $1, 2752(%rdi), %ymm12, %ymm12 +; AVX512F-NEXT: vmovdqa64 2560(%rdi), %xmm29 +; AVX512F-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm29, %ymm29 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm29[0],ymm12[0],ymm29[2],ymm12[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm18, %zmm21, %zmm18 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa 3136(%rdi), %xmm6 +; AVX512F-NEXT: vinserti128 $1, 3264(%rdi), %ymm6, %ymm6 +; AVX512F-NEXT: vmovdqa64 3072(%rdi), %xmm20 +; AVX512F-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm20[0],ymm6[0],ymm20[2],ymm6[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm21, %zmm16, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa 3648(%rdi), %xmm7 +; AVX512F-NEXT: vinserti128 $1, 3776(%rdi), %ymm7, %ymm7 +; AVX512F-NEXT: vmovdqa64 3584(%rdi), %xmm21 +; AVX512F-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm21, %ymm21 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm21[0],ymm7[0],ymm21[2],ymm7[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm22 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm20[1],ymm6[1],ymm20[3],ymm6[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm24[1],ymm30[1],ymm24[3],ymm30[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm17[1],ymm4[1],ymm17[3],ymm4[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm9[1],ymm27[1],ymm9[3],ymm27[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm31 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm12[1],ymm29[3],ymm12[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm25 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm26[1],ymm19[1],ymm26[3],ymm19[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm25, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm7[1],ymm21[3],ymm7[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 ; AVX512F-NEXT: vmovdqa64 %zmm1, 448(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm6, 320(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm2, 384(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm18, 320(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm5, 256(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, 128(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512F-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm2, 448(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm13, 320(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm10, 256(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm9, 320(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm4, 384(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10165,13 +10179,15 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm14, 256(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10185,98 +10201,105 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i64_stride8_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $6600, %rsp # imm = 0x19C8 -; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %ymm30 +; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm30[0],ymm0[2],ymm30[2] -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %ymm31 -; AVX512BW-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-NEXT: vmovdqa 3136(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm31[0],ymm3[2],ymm31[2] +; AVX512BW-NEXT: vmovdqa 3072(%rdi), %ymm14 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm14[0],ymm3[0],ymm14[2],ymm3[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm3[0],ymm26[2],ymm3[2] +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm26 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm23 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm26[0],ymm23[2],ymm26[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm23 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm23[0],ymm19[2],ymm23[2] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm18 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm18[0],ymm12[2],ymm18[2] +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm30 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm20[0],ymm16[2],ymm20[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %ymm25 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm25[0],ymm16[2],ymm25[2] +; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm21 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %ymm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm20[0],ymm21[0],ymm20[2],ymm21[2] +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %ymm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm21[0],ymm17[2],ymm21[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10287,17 +10310,18 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %ymm29 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm27 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %ymm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %ymm25 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm25[0],ymm0[0],ymm25[2],ymm0[2] +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm24 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %ymm22 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm24[0],ymm22[2],ymm24[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10312,13 +10336,14 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 2688(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 2688(%rdi), %ymm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa 2624(%rdi), %ymm10 -; AVX512BW-NEXT: vmovdqa 2560(%rdi), %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %ymm31 +; AVX512BW-NEXT: vmovdqa 2560(%rdi), %ymm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm10[0],ymm31[0],ymm10[2],ymm31[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10327,20 +10352,18 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm18 ; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 2240(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 2176(%rdi), %ymm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa 2112(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %ymm22 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm3[0],ymm22[2],ymm3[2] +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %ymm28 +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %ymm19 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm28[0],ymm19[2],ymm28[2] +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %ymm27 +; AVX512BW-NEXT: vmovdqa 2048(%rdi), %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm27[0],ymm6[2],ymm27[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10349,148 +10372,146 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 3968(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 3904(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3904(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 3840(%rdi), %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 3776(%rdi), %ymm24 -; AVX512BW-NEXT: vmovdqa 3712(%rdi), %ymm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm24[0],ymm8[2],ymm24[2] -; AVX512BW-NEXT: vmovdqa 3648(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 3776(%rdi), %ymm12 +; AVX512BW-NEXT: vmovdqa 3712(%rdi), %ymm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] +; AVX512BW-NEXT: vmovdqa 3648(%rdi), %ymm4 +; AVX512BW-NEXT: vmovdqa 3584(%rdi), %ymm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm30[1],ymm13[3],ymm30[3] -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm13[1],ymm31[1],ymm13[3],ymm31[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm14 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm26, %ymm15 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm15 = ymm26[1],mem[1],ymm26[3],mem[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm26[1],ymm23[3],ymm26[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm19[1],ymm23[1],ymm19[3],ymm23[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm12[1],ymm18[1],ymm12[3],ymm18[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm16[1],ymm25[1],ymm16[3],ymm25[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm20[1],ymm21[1],ymm20[3],ymm21[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm20[1],ymm16[3],ymm20[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = ymm29[1],mem[1],ymm29[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm28[1],ymm27[1],ymm28[3],ymm27[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm21[1],ymm17[3],ymm21[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm22, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm22[1],mem[1],ymm22[3],mem[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm25, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm24[1],ymm22[3],ymm24[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm8[1],ymm24[1],ymm8[3],ymm24[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm31[1],ymm10[3],ymm31[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm19[1],ymm28[1],ymm19[3],ymm28[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm27[1],ymm6[3],ymm27[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm19, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm9[1],ymm12[1],ymm9[3],ymm12[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm24[0],zmm4[0],zmm24[2],zmm4[2],zmm24[4],zmm4[4],zmm24[6],zmm4[6] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm27[0],zmm12[0],zmm27[2],zmm12[2],zmm27[4],zmm12[4],zmm27[6],zmm12[6] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm4 @@ -10501,15 +10522,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm31[0],mem[0],zmm31[2],mem[2],zmm31[4],mem[4],zmm31[6],mem[6] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 @@ -10519,72 +10542,72 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm26[0],zmm6[2],zmm26[2],zmm6[4],zmm26[4],zmm6[6],zmm26[6] +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm29[0],zmm26[0],zmm29[2],zmm26[2],zmm29[4],zmm26[4],zmm29[6],zmm26[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm26 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm18[0],zmm12[2],zmm18[2],zmm12[4],zmm18[4],zmm12[6],zmm18[6] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm30 ; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm20[0],zmm16[2],zmm20[2],zmm16[4],zmm20[4],zmm16[6],zmm20[6] +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm16 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm27[0],zmm8[2],zmm27[2],zmm8[4],zmm27[4],zmm8[6],zmm27[6] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm15[0],zmm24[2],zmm15[2],zmm24[4],zmm15[4],zmm24[6],zmm15[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10599,16 +10622,14 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm13[0],zmm11[0],zmm13[2],zmm11[2],zmm13[4],zmm11[4],zmm13[6],zmm11[6] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm4[0],zmm8[0],zmm4[2],zmm8[2],zmm4[4],zmm8[4],zmm4[6],zmm8[6] +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqa64 3648(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %zmm3 @@ -10620,9 +10641,9 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm1[0],zmm9[0],zmm1[2],zmm9[2],zmm1[4],zmm9[4],zmm1[6],zmm9[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm18 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] @@ -10632,118 +10653,117 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm24[1],zmm17[1],zmm24[3],zmm17[3],zmm24[5],zmm17[5],zmm24[7],zmm17[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm12[1],zmm27[3],zmm12[3],zmm27[5],zmm12[5],zmm27[7],zmm12[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm22[1],zmm25[3],zmm22[3],zmm25[5],zmm22[5],zmm25[7],zmm22[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm13[1],zmm31[3],zmm13[3],zmm31[5],zmm13[5],zmm31[7],zmm13[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm4 = zmm29[1],mem[1],zmm29[3],mem[3],zmm29[5],mem[5],zmm29[7],mem[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm19[1],mem[1],zmm19[3],mem[3],zmm19[5],mem[5],zmm19[7],mem[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm24[1],mem[1],zmm24[3],mem[3],zmm24[5],mem[5],zmm24[7],mem[7] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm9[1],zmm11[1],zmm9[3],zmm11[3],zmm9[5],zmm11[5],zmm9[7],zmm11[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm8[1],mem[1],zmm8[3],mem[3],zmm8[5],mem[5],zmm8[7],mem[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm8[1],zmm7[1],zmm8[3],zmm7[3],zmm8[5],zmm7[5],zmm8[7],zmm7[7] +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm8[1],zmm4[3],zmm8[3],zmm4[5],zmm8[5],zmm4[7],zmm8[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm1 = zmm1[1],mem[1],zmm1[3],mem[3],zmm1[5],mem[5],zmm1[7],mem[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm25[1],zmm18[1],zmm25[3],zmm18[3],zmm25[5],zmm18[5],zmm25[7],zmm18[7] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10754,236 +10774,241 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 {%k1} = zmm17[0],mem[0],zmm17[2],mem[2],zmm17[4],mem[4],zmm17[6],mem[6] +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 {%k1} = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm18[0],zmm27[2],zmm18[2],zmm27[4],zmm18[4],zmm27[6],zmm18[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm31 {%k1} # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm31 {%k1} = zmm24[0],mem[0],zmm24[2],mem[2],zmm24[4],mem[4],zmm24[6],mem[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm14[0],zmm28[0],zmm14[2],zmm28[2],zmm14[4],zmm28[4],zmm14[6],zmm28[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm31[0],zmm27[2],zmm31[2],zmm27[4],zmm31[4],zmm27[6],zmm31[6] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm15 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm31[0],zmm16[0],zmm31[2],zmm16[2],zmm31[4],zmm16[4],zmm31[6],zmm16[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm11 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm29[0],zmm28[0],zmm29[2],zmm28[2],zmm29[4],zmm28[4],zmm29[6],zmm28[6] +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 {%k1} = zmm12[0],mem[0],zmm12[2],mem[2],zmm12[4],mem[4],zmm12[6],mem[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm14[0],zmm21[0],zmm14[2],zmm21[2],zmm14[4],zmm21[4],zmm14[6],zmm21[6] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm21[0],zmm26[2],zmm21[2],zmm26[4],zmm21[4],zmm26[6],zmm21[6] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm8 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm6[0],zmm11[0],zmm6[2],zmm11[2],zmm6[4],zmm11[4],zmm6[6],zmm11[6] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm9[0],zmm5[0],zmm9[2],zmm5[2],zmm9[4],zmm5[4],zmm9[6],zmm5[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm25[1],zmm2[1],zmm25[3],zmm2[3],zmm25[5],zmm2[5],zmm25[7],zmm2[7] -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm24 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm3[1],zmm24[3],zmm3[3],zmm24[5],zmm3[5],zmm24[7],zmm3[7] +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm24 ; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm14, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm29 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm19 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm31[1],zmm27[3],zmm31[3],zmm27[5],zmm31[5],zmm27[7],zmm31[7] -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm19 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm30[1],zmm27[1],zmm30[3],zmm27[3],zmm30[5],zmm27[5],zmm30[7],zmm27[7] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm14, %zmm30 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm31[1],zmm16[1],zmm31[3],zmm16[3],zmm31[5],zmm16[5],zmm31[7],zmm16[7] +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm12 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm29[1],zmm28[1],zmm29[3],zmm28[3],zmm29[5],zmm28[5],zmm29[7],zmm28[7] -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm8, %zmm31 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm14[1],zmm21[1],zmm14[3],zmm21[3],zmm14[5],zmm21[5],zmm14[7],zmm21[7] +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm14, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm8[1],zmm26[3],zmm8[3],zmm26[5],zmm8[5],zmm26[7],zmm8[7] ; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm23, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm23, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm21 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm14, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm9, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm9 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm18[1],zmm11[1],zmm18[3],zmm11[3],zmm18[5],zmm11[5],zmm18[7],zmm11[7] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm27 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm2[1],zmm4[1],zmm2[3],zmm4[3],zmm2[5],zmm4[5],zmm2[7],zmm4[7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm0, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm9[1],zmm5[1],zmm9[3],zmm5[3],zmm9[5],zmm5[5],zmm9[7],zmm5[7] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm30 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm6[1],zmm3[1],zmm6[3],zmm3[3],zmm6[5],zmm3[5],zmm6[7],zmm3[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm8 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm11 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm12 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm13 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm15 # 64-byte Folded Reload ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] ; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -10994,165 +11019,160 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm14 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm18 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %xmm20 -; AVX512BW-NEXT: vinserti32x4 $1, 192(%rdi), %ymm20, %ymm20 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm0[0],ymm20[0],ymm0[2],ymm20[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %xmm25 -; AVX512BW-NEXT: vinserti32x4 $1, 704(%rdi), %ymm25, %ymm25 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %xmm30 -; AVX512BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm30, %ymm30 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm30[0],ymm25[0],ymm30[2],ymm25[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %xmm16 -; AVX512BW-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm16, %ymm16 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %xmm27 -; AVX512BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm27, %ymm27 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm27[0],ymm16[0],ymm27[2],ymm16[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm17, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa 1600(%rdi), %xmm9 -; AVX512BW-NEXT: vinserti128 $1, 1728(%rdi), %ymm9, %ymm9 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm19 -; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm19, %ymm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm19[0],ymm9[0],ymm19[2],ymm9[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm24, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa 2112(%rdi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, 2240(%rdi), %ymm5, %ymm5 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %xmm24 -; AVX512BW-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm24, %ymm24 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm24[0],ymm5[0],ymm24[2],ymm5[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqa 2624(%rdi), %xmm13 -; AVX512BW-NEXT: vinserti128 $1, 2752(%rdi), %ymm13, %ymm13 -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %xmm28 -; AVX512BW-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm28, %ymm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa 3136(%rdi), %xmm7 -; AVX512BW-NEXT: vinserti128 $1, 3264(%rdi), %ymm7, %ymm7 -; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %xmm26 -; AVX512BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm26, %ymm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm26[0],ymm7[0],ymm26[2],ymm7[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm17, %zmm21, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm15 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm15, %ymm15 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 3648(%rdi), %xmm10 -; AVX512BW-NEXT: vinserti128 $1, 3776(%rdi), %ymm10, %ymm10 -; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %xmm17 -; AVX512BW-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm17, %ymm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm17[0],ymm10[0],ymm17[2],ymm10[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm26[1],ymm7[1],ymm26[3],ymm7[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm30[1],ymm25[1],ymm30[3],ymm25[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm20[1],ymm0[3],ymm20[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm9[1],ymm19[3],ymm9[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm27[1],ymm16[1],ymm27[3],ymm16[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm31, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm24[1],ymm5[1],ymm24[3],ymm5[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm17[1],ymm10[1],ymm17[3],ymm10[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %xmm24 +; AVX512BW-NEXT: vinserti32x4 $1, 704(%rdi), %ymm24, %ymm30 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %xmm24 +; AVX512BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm24, %ymm24 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm24[0],ymm30[0],ymm24[2],ymm30[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm9 +; AVX512BW-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm9, %ymm27 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %xmm9 +; AVX512BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm9, %ymm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm27[0],ymm9[2],ymm27[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa 1600(%rdi), %xmm4 +; AVX512BW-NEXT: vinserti128 $1, 1728(%rdi), %ymm4, %ymm4 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm17 +; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm17, %ymm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm17[0],ymm4[0],ymm17[2],ymm4[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm19, %zmm29, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %xmm19 +; AVX512BW-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm19, %ymm19 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %xmm26 +; AVX512BW-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm26[0],ymm19[0],ymm26[2],ymm19[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm29, %zmm20, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa 2624(%rdi), %xmm12 +; AVX512BW-NEXT: vinserti128 $1, 2752(%rdi), %ymm12, %ymm12 +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %xmm29 +; AVX512BW-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm29, %ymm29 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm29[0],ymm12[0],ymm29[2],ymm12[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm21, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa 3136(%rdi), %xmm6 +; AVX512BW-NEXT: vinserti128 $1, 3264(%rdi), %ymm6, %ymm6 +; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %xmm20 +; AVX512BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm20[0],ymm6[0],ymm20[2],ymm6[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm21, %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa 3648(%rdi), %xmm7 +; AVX512BW-NEXT: vinserti128 $1, 3776(%rdi), %ymm7, %ymm7 +; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %xmm21 +; AVX512BW-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm21, %ymm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm21[0],ymm7[0],ymm21[2],ymm7[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm22 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm20[1],ymm6[1],ymm20[3],ymm6[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm24[1],ymm30[1],ymm24[3],ymm30[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm17[1],ymm4[1],ymm17[3],ymm4[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm9[1],ymm27[1],ymm9[3],ymm27[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm31 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm12[1],ymm29[3],ymm12[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm25 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm26[1],ymm19[1],ymm26[3],ymm19[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm25, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm7[1],ymm21[3],ymm7[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 384(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 320(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, 128(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512BW-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11236,13 +11256,15 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 256(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll index f0118bc3b33b6..19c97ece8937f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll @@ -396,238 +396,237 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride3_vf32: ; SSE: # %bb.0: ; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rdi), %xmm14 -; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movdqa 48(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm14, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa 32(%rdi), %xmm8 +; SSE-NEXT: movdqa 48(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: pandn %xmm6, %xmm14 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: pxor %xmm10, %xmm10 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pxor %xmm9, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm11 -; SSE-NEXT: por %xmm9, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm9, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm10 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm13, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; SSE-NEXT: pand %xmm5, %xmm11 -; SSE-NEXT: por %xmm12, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm13[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,3,2,3] -; SSE-NEXT: packuswb %xmm11, %xmm0 -; SSE-NEXT: movdqa 80(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm9, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa 80(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE-NEXT: movdqa %xmm10, %xmm11 ; SSE-NEXT: pand %xmm15, %xmm11 -; SSE-NEXT: por %xmm12, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm11, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,5,6,7,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,5,6,7,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm8, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm11 -; SSE-NEXT: por %xmm11, %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: por %xmm11, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,6,7,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm13, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: pand %xmm7, %xmm14 -; SSE-NEXT: por %xmm11, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: por %xmm14, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pandn %xmm8, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm14, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm13 -; SSE-NEXT: pandn %xmm9, %xmm5 -; SSE-NEXT: por %xmm13, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,1,3] +; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm12, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movdqa %xmm6, 16(%rdx) -; SSE-NEXT: movdqa %xmm8, (%rdx) -; SSE-NEXT: movdqa %xmm3, 16(%rcx) -; SSE-NEXT: movdqa %xmm11, (%rcx) +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm13 +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; SSE-NEXT: pand %xmm5, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: por %xmm13, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: pandn %xmm12, %xmm6 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm11, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movdqa %xmm7, 16(%rdx) +; SSE-NEXT: movdqa %xmm3, (%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride3_vf32: @@ -768,592 +767,597 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i8_stride3_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movdqa 80(%rdi), %xmm2 -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: pxor %xmm10, %xmm10 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,1,3] +; SSE-NEXT: subq $168, %rsp +; SSE-NEXT: movdqa 80(%rdi), %xmm9 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm8[8],xmm13[9],xmm8[9],xmm13[10],xmm8[10],xmm13[11],xmm8[11],xmm13[12],xmm8[12],xmm13[13],xmm8[13],xmm13[14],xmm8[14],xmm13[15],xmm8[15] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm12, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa 32(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: packuswb %xmm0, %xmm9 +; SSE-NEXT: movdqa 176(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,0] +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: movdqa 112(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: movdqa 96(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: por %xmm11, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; SSE-NEXT: packuswb %xmm0, %xmm9 +; SSE-NEXT: movdqa 128(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,0] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: movdqa 144(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: packuswb %xmm0, %xmm4 -; SSE-NEXT: movdqa 176(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pand %xmm12, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa 96(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] -; SSE-NEXT: packuswb %xmm4, %xmm9 -; SSE-NEXT: movdqa 128(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm8 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: pandn %xmm9, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm11, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm9, %xmm0 ; SSE-NEXT: movdqa %xmm12, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm9, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: pandn %xmm13, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm12, %xmm13 -; SSE-NEXT: por %xmm8, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm13[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: pandn %xmm8, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm13, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,5,6,7,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm15, %xmm8 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm11 -; SSE-NEXT: por %xmm8, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: pandn %xmm8, %xmm13 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm9, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm13[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] ; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: pandn %xmm9, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,5,6,7,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm6, %xmm11 -; SSE-NEXT: por %xmm11, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm9, %xmm0 ; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: pandn %xmm15, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pandn %xmm10, %xmm9 +; SSE-NEXT: pand %xmm12, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm10, %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm11, %xmm10 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: pandn %xmm11, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,2,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: pandn %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm6, %xmm11 ; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; SSE-NEXT: pand %xmm12, %xmm5 -; SSE-NEXT: por %xmm11, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,3,2,3,4,5,6,7] +; SSE-NEXT: por %xmm13, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm5[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm11 -; SSE-NEXT: por %xmm11, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm9 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; SSE-NEXT: pand %xmm12, %xmm9 -; SSE-NEXT: por %xmm11, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm9, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm15 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm9, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm11 -; SSE-NEXT: por %xmm11, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] -; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm15 +; SSE-NEXT: por %xmm9, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE-NEXT: pand %xmm2, %xmm14 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pandn %xmm9, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] ; SSE-NEXT: pand %xmm12, %xmm14 -; SSE-NEXT: por %xmm11, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm9, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: pand %xmm5, %xmm11 -; SSE-NEXT: por %xmm11, %xmm8 -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; SSE-NEXT: pand %xmm12, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm12 -; SSE-NEXT: por %xmm9, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movdqa %xmm6, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm13, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movdqa %xmm5, 32(%rcx) -; SSE-NEXT: movdqa %xmm8, 48(%rcx) -; SSE-NEXT: movdqa %xmm4, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) -; SSE-NEXT: addq $152, %rsp +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm11, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movdqa %xmm7, 32(%rdx) +; SSE-NEXT: movdqa %xmm13, 48(%rdx) +; SSE-NEXT: movdqa %xmm10, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movdqa %xmm4, 32(%rcx) +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm9, (%rcx) +; SSE-NEXT: movdqa %xmm5, 16(%rcx) +; SSE-NEXT: addq $168, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm15, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm15, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm14, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm10, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm14, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm14, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm10, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm9, %xmm11 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm14, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm15, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm15, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm15, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 32(%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 48(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 32(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm13, 16(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 32(%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 16(%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 48(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 32(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm14, 16(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm8, 48(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 16(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 32(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm13, 16(%rcx) ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i8_stride3_vf64: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index d995051642643..764020b3b48c6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -297,12 +297,12 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa 48(%rdi), %xmm11 +; SSE-NEXT: movdqa 32(%rdi), %xmm8 +; SSE-NEXT: movdqa 48(%rdi), %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0] -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: packuswb %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm2, %xmm3 @@ -310,72 +310,72 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm3, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] +; SSE-NEXT: pxor %xmm10, %xmm10 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; SSE-NEXT: packuswb %xmm8, %xmm14 -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm10[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm13[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm11[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] +; SSE-NEXT: packuswb %xmm7, %xmm14 +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm12[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] ; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: packuswb %xmm15, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm14[0,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: packuswb %xmm15, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm14[0,3] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm11, %xmm6 +; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,6,5,4] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm13, %xmm8 ; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm13 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm8[0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] @@ -383,24 +383,24 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: packuswb %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; SSE-NEXT: packuswb %xmm3, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] ; SSE-NEXT: movdqa %xmm0, (%rsi) -; SSE-NEXT: movaps %xmm9, (%rdx) +; SSE-NEXT: movaps %xmm10, (%rdx) ; SSE-NEXT: movaps %xmm1, (%rcx) ; SSE-NEXT: movaps %xmm5, (%r8) ; SSE-NEXT: retq @@ -528,37 +528,38 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i8_stride4_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $120, %rsp +; SSE-NEXT: subq $136, %rsp ; SSE-NEXT: movdqa 64(%rdi), %xmm4 -; SSE-NEXT: movdqa 80(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm5 ; SSE-NEXT: movdqa 96(%rdi), %xmm15 -; SSE-NEXT: movdqa 112(%rdi), %xmm9 -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa 16(%rdi), %xmm14 -; SSE-NEXT: movdqa 32(%rdi), %xmm8 +; SSE-NEXT: movdqa 112(%rdi), %xmm14 +; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa 16(%rdi), %xmm11 +; SSE-NEXT: movdqa 32(%rdi), %xmm7 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,0,255,0,255,0] ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pand %xmm6, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm6 ; SSE-NEXT: packuswb %xmm1, %xmm6 ; SSE-NEXT: pxor %xmm4, %xmm4 @@ -573,83 +574,85 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: packuswb %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: packuswb %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: packuswb %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm7[0,3] -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm5[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: packuswb %xmm9, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm8[0,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: packuswb %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: packuswb %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm12[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: packuswb %xmm1, %xmm4 @@ -660,26 +663,26 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm3, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[0,3] -; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] @@ -687,21 +690,20 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm8 -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm7 +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm8[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm7[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] @@ -712,65 +714,66 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: packuswb %xmm2, %xmm8 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: packuswb %xmm2, %xmm7 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm8[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm7[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; SSE-NEXT: packuswb %xmm8, %xmm9 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] +; SSE-NEXT: packuswb %xmm7, %xmm11 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: packuswb %xmm8, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm9[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: packuswb %xmm7, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm11[0,3] ; SSE-NEXT: movdqa %xmm6, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps %xmm4, 16(%rdx) -; SSE-NEXT: movaps %xmm5, (%rdx) +; SSE-NEXT: movaps %xmm13, (%rdx) ; SSE-NEXT: movaps %xmm3, 16(%rcx) ; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps %xmm7, 16(%r8) +; SSE-NEXT: movaps %xmm8, 16(%r8) ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: addq $120, %rsp +; SSE-NEXT: addq $136, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride4_vf32: @@ -1024,87 +1027,87 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i8_stride4_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $632, %rsp # imm = 0x278 -; SSE-NEXT: movdqa 16(%rdi), %xmm15 -; SSE-NEXT: movdqa 32(%rdi), %xmm13 -; SSE-NEXT: movdqa 48(%rdi), %xmm7 -; SSE-NEXT: movdqa 128(%rdi), %xmm14 +; SSE-NEXT: subq $664, %rsp # imm = 0x298 +; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm6 +; SSE-NEXT: movdqa 48(%rdi), %xmm14 +; SSE-NEXT: movdqa 128(%rdi), %xmm15 ; SSE-NEXT: movdqa 144(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm4 +; SSE-NEXT: movdqa 160(%rdi), %xmm11 ; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: movdqa 64(%rdi), %xmm8 -; SSE-NEXT: movdqa 80(%rdi), %xmm11 +; SSE-NEXT: movdqa 64(%rdi), %xmm13 +; SSE-NEXT: movdqa 80(%rdi), %xmm7 ; SSE-NEXT: movdqa 96(%rdi), %xmm2 ; SSE-NEXT: movdqa 112(%rdi), %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,0,255,0,255,0] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: movdqa 192(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: packuswb %xmm0, %xmm6 -; SSE-NEXT: packuswb %xmm1, %xmm6 +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: packuswb %xmm0, %xmm5 +; SSE-NEXT: packuswb %xmm1, %xmm5 ; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm0 @@ -1117,32 +1120,32 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: packuswb %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm13, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: packuswb %xmm5, %xmm1 +; SSE-NEXT: packuswb %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1153,112 +1156,114 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,2,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: packuswb %xmm5, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: packuswb %xmm5, %xmm1 +; SSE-NEXT: packuswb %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; SSE-NEXT: packuswb %xmm1, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm0[0,3] -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] +; SSE-NEXT: packuswb %xmm1, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,3],xmm0[0,3] +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] @@ -1277,43 +1282,44 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm4[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm3[0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm4[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm3[0,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] @@ -1333,7 +1339,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] @@ -1359,188 +1365,189 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,7,6,5,4] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm3 +; SSE-NEXT: packuswb %xmm6, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,3,2,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm7 ; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm9 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm7[0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm7[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: packuswb %xmm7, %xmm8 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: packuswb %xmm4, %xmm7 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: packuswb %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm8[0,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; SSE-NEXT: packuswb %xmm8, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm7[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: packuswb %xmm7, %xmm8 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] -; SSE-NEXT: packuswb %xmm8, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm9[0,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] -; SSE-NEXT: packuswb %xmm9, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: packuswb %xmm10, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm8[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: packuswb %xmm8, %xmm10 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] -; SSE-NEXT: packuswb %xmm9, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm11[0,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; SSE-NEXT: packuswb %xmm11, %xmm12 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: packuswb %xmm11, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm10[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; SSE-NEXT: packuswb %xmm10, %xmm11 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; SSE-NEXT: packuswb %xmm11, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm12[0,3] -; SSE-NEXT: movdqa %xmm6, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 16(%rsi) +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] +; SSE-NEXT: packuswb %xmm10, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm11[0,3] +; SSE-NEXT: movdqa %xmm5, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 16(%rsi) ; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps %xmm14, (%rdx) +; SSE-NEXT: movaps %xmm15, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps %xmm4, 48(%rcx) +; SSE-NEXT: movaps %xmm6, 48(%rcx) ; SSE-NEXT: movaps %xmm3, 32(%rcx) ; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps %xmm10, (%rcx) +; SSE-NEXT: movaps %xmm12, (%rcx) ; SSE-NEXT: movaps %xmm13, 48(%r8) ; SSE-NEXT: movaps %xmm8, 32(%r8) ; SSE-NEXT: movaps %xmm7, 16(%r8) ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: addq $632, %rsp # imm = 0x278 +; SSE-NEXT: addq $664, %rsp # imm = 0x298 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride4_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 +; AVX1-ONLY-NEXT: subq $328, %rsp # imm = 0x148 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm0 @@ -1690,104 +1697,107 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm5 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm6 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: addq $328, %rsp # imm = 0x148 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i8_stride4_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $216, %rsp +; AVX2-ONLY-NEXT: subq $168, %rsp ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm2 +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm8 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm8 ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm9 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm1, %ymm9 -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] -; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm2, %ymm9 +; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm2, %ymm9 +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,4,0,4,0,4,0,4] +; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm1, %ymm9 ; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm2, %ymm11 +; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm1, %ymm11 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1800,24 +1810,20 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm14 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm12, %xmm10 -; AVX2-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm2, %ymm15 +; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm10, %ymm14 +; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm1, %ymm15 ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm2, %ymm13 +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm1, %ymm13 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm13 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, %xmm10 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm15 ; AVX2-ONLY-NEXT: vmovdqa %xmm6, %xmm7 ; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1827,15 +1833,15 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa %xmm5, %xmm14 ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm1 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm2 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm4, %ymm13 -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm2, %ymm13 +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm4, %ymm13 +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm1, %ymm13 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm6, %ymm4 -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm6, %ymm4 +; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1846,95 +1852,93 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm4 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm12, %ymm3 -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm2, %ymm3 +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm3 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm13 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm7, %ymm13 -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm2, %ymm13 +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm1, %ymm13 ; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm6, %ymm15 -; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm2, %ymm15 +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm1, %ymm15 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa %ymm12, %ymm10 -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm12, %ymm1 -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm12, %ymm3 -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm2 +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm10, %ymm2 +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm14, %ymm3 +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm6 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm7, %ymm6 -; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm1, %ymm6 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm1, %ymm7 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm6 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm6 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm4 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm4 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm10, %ymm4 -; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm12, %ymm5 -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm2, %ymm4 -; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm14, %ymm5 +; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%r8) -; AVX2-ONLY-NEXT: addq $216, %rsp +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%r8) +; AVX2-ONLY-NEXT: addq $168, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index fec4b1b0511ce..1532859fb115a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -422,283 +422,276 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i8_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm4 +; SSE-NEXT: movdqa 64(%rdi), %xmm9 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa 32(%rdi), %xmm10 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: pxor %xmm8, %xmm8 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm14, %xmm8 -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: por %xmm8, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm7[0,1,2,3,4,6,5,7] -; SSE-NEXT: packuswb %xmm11, %xmm11 -; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: por %xmm7, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm11 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,0,0,65535,65535] +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,5,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm11, %xmm7 +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[2,3] -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,4] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: pandn %xmm12, %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm11, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm12[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm12[2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm14, %xmm13 -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm12[2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm12 +; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pand %xmm12, %xmm14 -; SSE-NEXT: por %xmm13, %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm6[8],xmm13[9],xmm6[9],xmm13[10],xmm6[10],xmm13[11],xmm6[11],xmm13[12],xmm6[12],xmm13[13],xmm6[13],xmm13[14],xmm6[14],xmm13[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] +; SSE-NEXT: pand %xmm14, %xmm13 +; SSE-NEXT: por %xmm12, %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: pand %xmm15, %xmm14 -; SSE-NEXT: pandn %xmm13, %xmm15 -; SSE-NEXT: por %xmm14, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm15[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,4,5,7] -; SSE-NEXT: packuswb %xmm13, %xmm13 -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7] -; SSE-NEXT: psllq $48, %xmm4 -; SSE-NEXT: packuswb %xmm11, %xmm4 -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: pand %xmm8, %xmm13 -; SSE-NEXT: movaps %xmm10, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm10[3,0] -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm11[0,2] +; SSE-NEXT: pand %xmm15, %xmm13 +; SSE-NEXT: pandn %xmm12, %xmm15 +; SSE-NEXT: por %xmm13, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm15[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,4,5,7] +; SSE-NEXT: packuswb %xmm12, %xmm12 +; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] +; SSE-NEXT: psllq $48, %xmm0 +; SSE-NEXT: packuswb %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: por %xmm11, %xmm12 +; SSE-NEXT: pand %xmm6, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm9[3,0] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm13, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm12, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm15 ; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm15, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm15, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pand %xmm13, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,2,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: pand %xmm14, %xmm7 +; SSE-NEXT: pandn %xmm10, %xmm14 +; SSE-NEXT: pand %xmm12, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm12 +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm1, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pandn %xmm15, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm1[2,0] -; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,0] +; SSE-NEXT: por %xmm7, %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm0[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,0,1,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm9, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm2[3,0] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0,2] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: por %xmm11, %xmm12 ; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15] -; SSE-NEXT: pxor %xmm8, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm1[2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,4,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: movaps %xmm7, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] -; SSE-NEXT: por %xmm11, %xmm1 -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,5] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: por %xmm10, %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm1[1,2] -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm15, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm1[1,2] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] -; SSE-NEXT: pand %xmm14, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2,3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2,3,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: por %xmm5, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: por %xmm4, %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,4] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: packuswb %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: movdqa %xmm12, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movdqa %xmm10, (%rcx) +; SSE-NEXT: movdqa %xmm6, (%r8) ; SSE-NEXT: movaps %xmm4, (%r9) ; SSE-NEXT: retq ; @@ -958,156 +951,158 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i8_stride5_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $168, %rsp -; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: subq $184, %rsp +; SSE-NEXT: movdqa (%rdi), %xmm9 ; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm10 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pxor %xmm5, %xmm5 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pandn %xmm9, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: por %xmm11, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm3[8],xmm15[9],xmm3[9],xmm15[10],xmm3[10],xmm15[11],xmm3[11],xmm15[12],xmm3[12],xmm15[13],xmm3[13],xmm15[14],xmm3[14],xmm15[15],xmm3[15] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa 64(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 96(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: movdqa 80(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm6 ; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,1,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa 144(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm3[8],xmm12[9],xmm3[9],xmm12[10],xmm3[10],xmm12[11],xmm3[11],xmm12[12],xmm3[12],xmm12[13],xmm3[13],xmm12[14],xmm3[14],xmm12[15],xmm3[15] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] @@ -1118,85 +1113,84 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: psllq $48, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pandn %xmm6, %xmm11 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm11, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm11 -; SSE-NEXT: movdqa %xmm9, %xmm14 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: psllq $48, %xmm11 -; SSE-NEXT: packuswb %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm6[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: psllq $48, %xmm1 +; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 ; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,1,3] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm2[3,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[0,2] @@ -1204,158 +1198,162 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,4] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm14, %xmm12 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm14 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: pandn %xmm7, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pandn %xmm11, %xmm6 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm11 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,1,2,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,2] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,1,2,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm11[0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm12[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm0, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm0, %xmm11 +; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: por %xmm2, %xmm11 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: por %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm11, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm10 -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm4, %xmm12 -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm14 +; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: por %xmm11, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,2,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm11[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm8[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm8, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm8, %xmm11 +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm1, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm2[2,0] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm11[2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] @@ -1364,68 +1362,69 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm15[3,0] -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm13[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,5] -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm15 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: por %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] ; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm1[2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,6,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: pxor %xmm13, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] ; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: por %xmm14, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm11[3,0] -; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0,2] -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,0,1,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,4,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,5] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] @@ -1435,77 +1434,77 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] ; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: por %xmm11, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[3,1,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE-NEXT: packuswb %xmm1, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,1] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm1[1,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm10 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15] -; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; SSE-NEXT: packuswb %xmm1, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,1] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm1[1,2] +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] +; SSE-NEXT: pand %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm10, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2,3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2,3,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-NEXT: packuswb %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[3,1,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: packuswb %xmm3, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1514,14 +1513,14 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm9, 16(%rcx) +; SSE-NEXT: movdqa %xmm12, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movdqa %xmm8, 16(%r8) -; SSE-NEXT: movdqa %xmm4, (%r8) +; SSE-NEXT: movdqa %xmm9, 16(%r8) +; SSE-NEXT: movdqa %xmm8, (%r8) ; SSE-NEXT: movaps %xmm1, 16(%r9) ; SSE-NEXT: movaps %xmm2, (%r9) -; SSE-NEXT: addq $168, %rsp +; SSE-NEXT: addq $184, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride5_vf32: @@ -1541,13 +1540,13 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[0,5,10,15],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm8[u,u,u] @@ -1560,18 +1559,18 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11] ; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm12, %ymm14 ; AVX1-ONLY-NEXT: vorps %ymm14, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,7,12] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[1,6,11],zero,zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm8[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm15, %xmm7 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm14, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm8[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm14, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,0,5,10,15],zero,zero,zero,xmm3[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 @@ -1580,26 +1579,26 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3,4],xmm11[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm12, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm12, %ymm6 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12] ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm12, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm13, %xmm6 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,1,6,11],zero,zero,zero,zero,xmm3[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[u,u,u],zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1,2,3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3,4],xmm6[5,6,7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[2,7,12],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,0,5,10,15],zero,zero,zero,xmm8[u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 @@ -1607,14 +1606,14 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] ; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm8[u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm13[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[3,8,13],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm13[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[0,5,10,15,u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm13, %xmm6 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,2,7,12],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 @@ -1625,23 +1624,23 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm15, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm13, %ymm6 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm0[4,9,14] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm15 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm15, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4,5],xmm9[6,7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,2,7,12],zero,zero,zero,xmm8[u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,9,14],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[2,7,12,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,3,8,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 @@ -1654,14 +1653,14 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm2, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm12, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -2012,61 +2011,62 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i8_stride5_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $568, %rsp # imm = 0x238 +; SSE-NEXT: subq $552, %rsp # imm = 0x228 ; SSE-NEXT: movdqa 160(%rdi), %xmm9 ; SSE-NEXT: movdqa 176(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm14 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: pxor %xmm12, %xmm12 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm9, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm9, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm9, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm9 +; SSE-NEXT: pand %xmm10, %xmm9 ; SSE-NEXT: por %xmm0, %xmm9 ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] +; SSE-NEXT: pand %xmm8, %xmm9 ; SSE-NEXT: por %xmm2, %xmm9 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] @@ -2074,70 +2074,74 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa 224(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,3] +; SSE-NEXT: movdqa 224(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm9, %xmm9 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 48(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] -; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] +; SSE-NEXT: pand %xmm8, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] @@ -2145,13 +2149,13 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,3] @@ -2159,663 +2163,617 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 272(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 288(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: movdqa 288(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: packuswb %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa 256(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: movdqa 240(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm12, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa 240(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa 304(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa 128(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: movdqa 112(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa 128(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa 80(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: packuswb %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movdqa 96(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa 80(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,2,1,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa 144(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa 144(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,4] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm2 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm2[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: psllq $48, %xmm4 -; SSE-NEXT: packuswb %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] +; SSE-NEXT: psllq $48, %xmm3 +; SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm12, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,4,5,7] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: por %xmm8, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,4,5,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[2,3] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: psllq $48, %xmm6 -; SSE-NEXT: packuswb %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pand %xmm11, %xmm6 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; SSE-NEXT: psllq $48, %xmm4 +; SSE-NEXT: packuswb %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] ; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,1,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: psllq $48, %xmm4 -; SSE-NEXT: packuswb %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,1,3] +; SSE-NEXT: psllq $48, %xmm3 +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: psllq $48, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,4] +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,1,2,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm0, %xmm6 -; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm2[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,1,2,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm13[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm0, %xmm7 -; SSE-NEXT: pand %xmm12, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,1,2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm1, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm13 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,1,2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pandn %xmm13, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm5 -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pandn %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,1,2,0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm1[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[0,2] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,1,2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm10, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm1, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm13, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,1,2,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm4, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm7 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm7 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm2[2,0] +; SSE-NEXT: pand %xmm8, %xmm15 +; SSE-NEXT: pand %xmm8, %xmm13 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] @@ -2824,12 +2782,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,6,5,6,7] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm0[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] @@ -2838,8 +2796,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] @@ -2847,16 +2804,18 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm10, %xmm9 +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] @@ -2867,102 +2826,140 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15] +; SSE-NEXT: pxor %xmm10, %xmm10 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm0[2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,6] -; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,5] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE-NEXT: pxor %xmm13, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pandn %xmm2, %xmm12 -; SSE-NEXT: por %xmm12, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2] -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2] +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,5] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3],xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[1,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm0[1,2] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm12, %xmm8 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] -; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm6, %xmm7 ; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm13[8],xmm8[9],xmm13[9],xmm8[10],xmm13[10],xmm8[11],xmm13[11],xmm8[12],xmm13[12],xmm8[13],xmm13[13],xmm8[14],xmm13[14],xmm8[15],xmm13[15] -; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] +; SSE-NEXT: pand %xmm6, %xmm8 ; SSE-NEXT: por %xmm7, %xmm8 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] @@ -2970,142 +2967,143 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: packuswb %xmm1, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: packuswb %xmm0, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,1] -; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[1,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm13[8],xmm10[9],xmm13[9],xmm10[10],xmm13[10],xmm10[11],xmm13[11],xmm10[12],xmm13[12],xmm10[13],xmm13[13],xmm10[14],xmm13[14],xmm10[15],xmm13[15] -; SSE-NEXT: pand %xmm9, %xmm10 -; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm10[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,4] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: packuswb %xmm1, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm1[1,2] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pandn %xmm4, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm10, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,4] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm8, %xmm4 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; SSE-NEXT: packuswb %xmm1, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm1[1,2] -; SSE-NEXT: movaps %xmm10, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm10[3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm8, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,3],xmm0[1,2] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2,3,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm15 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15] +; SSE-NEXT: pand %xmm6, %xmm15 +; SSE-NEXT: por %xmm8, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: packuswb %xmm0, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,1] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm0[1,2] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2,3,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm13 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm1[8],xmm13[9],xmm1[9],xmm13[10],xmm1[10],xmm13[11],xmm1[11],xmm13[12],xmm1[12],xmm13[13],xmm1[13],xmm13[14],xmm1[14],xmm13[15],xmm1[15] +; SSE-NEXT: pand %xmm6, %xmm13 +; SSE-NEXT: por %xmm5, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: packuswb %xmm0, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm0[1,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] +; SSE-NEXT: pand %xmm6, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: por %xmm12, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,3,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[0,2,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; SSE-NEXT: packuswb %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -3122,441 +3120,435 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm3, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rcx) -; SSE-NEXT: movdqa %xmm14, 16(%r8) -; SSE-NEXT: movdqa %xmm6, 48(%r8) -; SSE-NEXT: movdqa %xmm15, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r8) -; SSE-NEXT: movaps %xmm1, 16(%r9) -; SSE-NEXT: movaps %xmm4, 48(%r9) -; SSE-NEXT: movaps %xmm7, (%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movdqa %xmm9, 16(%r8) +; SSE-NEXT: movdqa %xmm10, 48(%r8) +; SSE-NEXT: movdqa %xmm11, (%r8) +; SSE-NEXT: movdqa %xmm14, 32(%r8) +; SSE-NEXT: movaps %xmm0, 16(%r9) +; SSE-NEXT: movaps %xmm5, 48(%r9) +; SSE-NEXT: movaps %xmm8, (%r9) ; SSE-NEXT: movaps %xmm2, 32(%r9) -; SSE-NEXT: addq $568, %rsp # imm = 0x238 +; SSE-NEXT: addq $552, %rsp # imm = 0x228 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,9,14,0,4,9,14,0,4,9,14,0,4,9,14,0] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,9,14,0,4,9,14,0,4,9,14,0,4,9,14,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,2,7,12,0,0,128,128,128,2,7,12,0,0,128] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [8,13,128,128,128,0,0,3,8,13,128,128,128,0,0,3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,2,7,12,0,0,128,128,128,2,7,12,0,0,128] ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [8,13,128,128,128,0,0,3,8,13,128,128,128,0,0,3] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [1,6,11,128,128,128,128,0,1,6,11,128,128,128,128,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,0,5,10,15,0,128,128,128,0,5,10,15,0] +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,1,6,11,128,128,128,128,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [9,14,128,128,128,0,0,4,9,14,128,128,128,0,0,4] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [128,128,128,0,5,10,15,0,128,128,128,0,5,10,15,0] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128] ; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [9,14,128,128,128,0,0,4,9,14,128,128,128,0,0,4] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm4, %xmm6, %xmm2 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm3, %xmm4, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [8,13,0,0,128,128,128,3,8,13,0,0,128,128,128,3] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [8,13,0,0,128,128,128,3,8,13,0,0,128,128,128,3] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,128,0,0,4,9,14,128,128,128,0,0,4,9,14,128] ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [128,128,0,0,4,9,14,128,128,128,0,0,4,9,14,128] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <0,5,10,15,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,1,6,11,0,1,6,11,0,1,6,11,0,1,6,11] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm8, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,5,10,15,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm12[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,6,11,0,1,6,11,0,1,6,11,0,1,6,11] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm9 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm12, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[2,7,12] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [9,14,0,128,128,128,128,4,9,14,0,128,128,128,128,4] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,0,0,5,10,15,128,128,128,0,0,5,10,15,128] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[3,4,5,6,7,8,9,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,6,11,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4],xmm11[5,6,7] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,2,7,12,0,2,7,12,0,2,7,12,0,2,7,12] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm8, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm15, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[2,7,12] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u],zero,zero,zero,zero,xmm7[4,9,14,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,0,5,10,15],zero,zero,zero,xmm1[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,4,5,6,7,8,9,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,2,7,12,0,2,7,12,0,2,7,12,0,2,7,12] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm15 +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm12, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm15, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,0,5,10,15],zero,zero,zero,xmm9[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm10[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[3,8,13] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,128,0,1,6,11,128,128,128,128,0,1,6,11,128,128] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u],zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm12, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm12[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm14, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,1,6,11],zero,zero,zero,zero,xmm1[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [10,15,0,128,128,128,0,5,10,15,0,128,128,128,0,5] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <2,7,12,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,4,9,14,0,128,128,128,128,4,9,14,0,128,128] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,2,7,12,128,128,128,0,0] ; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm8, %ymm13 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm13[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm12, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u],zero,zero,zero,xmm4[0,5,10,15,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm2[3,4,5,6,7,8,9,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm15[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,1,6,11],zero,zero,zero,zero,xmm8[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,4,5,6,7,8,9,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm4[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,zero,xmm2[4,9,14,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm11[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,128,2,7,12,0,0,128,128,128,2,7,12,0,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm5[u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[0,5,10,15,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2],xmm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u],zero,zero,zero,xmm8[1,6,11,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm11, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[3,8,13],zero,zero,zero,zero,zero,zero,xmm12[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm14[u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[0,5,10,15,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u],zero,zero,zero,xmm7[0,5,10,15,u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[1,6,11,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,2,7,12],zero,zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,8,13],zero,zero,zero,zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,7,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,4,5,6,7,8,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <3,8,13,128,128,128,128,128,128,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,4,9,14,0,4,9,14,0,4,9,14,0,4,9,14] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm7, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm11[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm14[u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u],zero,zero,zero,xmm8[0,5,10,15,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm9[4,9,14] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,6,11,128,128,128,128,0,1,6,11,128,128,128,128] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,128,128,128,0,5,10,15,0,128,128,128,0,5,10,15] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u],zero,zero,zero,xmm8[2,7,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,0,0,3,8,13,128,128,128,0,0,3,8,13,128,128] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[4,9,14],zero,zero,zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [128,1,6,11,0,0,128,128,128,1,6,11,0,0,128,128] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,128,128,3,8,13,0,0,128,128,128,3,8,13,0,0] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,128,128,128,0,5,10,15,0,128,128,128,0,5,10,15] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,0,0,3,8,13,128,128,128,0,0,3,8,13,128,128] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,128,128,128,3,8,13,0,0] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5],xmm12[6,7] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [12,128,128,128,0,0,2,7,12,128,128,128,0,0,2,7] ; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm13 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [4,9,14,128,128,128,0,0,4,9,14,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,4,9,14,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm5, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm4, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[2,7,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[3,4,5,6,7,8,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,4,5,6,7,8,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[4,9,14],zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[1,6,11,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,9,14],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX1-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -3564,246 +3556,247 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-LABEL: load_i8_stride5_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $136, %rsp -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm0 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm12 -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm15 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm9 +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm13 -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm10 +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm11 -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm15, %ymm4, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm6 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11] -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm12, %ymm5 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm12 -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm9, %ymm5 -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm1 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11] +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm15, %ymm15 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm1, %ymm15, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm0 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm7 -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm9 -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm15, %ymm4, %ymm5 +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm7, %ymm15 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm7 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm5 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12] +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm11, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm3, %ymm1, %ymm5 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm6 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12] -; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm13, %ymm8 -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm5, %ymm8, %ymm13 -; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm10, %ymm5 -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm7 -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm10 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm4, %ymm15, %ymm0 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u> ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13] -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm14, %ymm14 -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm14, %ymm0 +; AVX2-ONLY-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13] +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm12, %ymm12 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm14 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm14, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm14, %xmm6 -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm4, %xmm12 +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm5 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm4, %ymm15, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm7 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14] -; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm11, %ymm6 -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm7 -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX2-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm5, %ymm6, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14] +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm12, %ymm12 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm3, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm5, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm5 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm10 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm5 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm4, %ymm15, %ymm14 -; AVX2-ONLY-NEXT: vmovdqa 304(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm15 -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm2 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13 +; AVX2-ONLY-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm9 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX2-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm3 +; AVX2-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm12 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm4 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm12 ; AVX2-ONLY-NEXT: vpor %xmm4, %xmm12, %xmm4 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5,6,7],ymm9[8,9,10,11,12],ymm4[13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7],ymm15[8,9,10,11,12],ymm4[13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm12 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm7 -; AVX2-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm7 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm14 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm15 +; AVX2-ONLY-NEXT: vpor %xmm7, %xmm15, %xmm7 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0,1,2,3,4],ymm7[5,6,7],ymm13[8,9,10,11,12],ymm7[13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm8 -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14] -; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm9 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128] -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm12 -; AVX2-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-ONLY-NEXT: vpblendvb %ymm12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm8 -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX2-ONLY-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm8 +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm12 +; AVX2-ONLY-NEXT: vpor %xmm8, %xmm12, %xmm8 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm14, %xmm10 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm10 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14] +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm11 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128] +; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm14 +; AVX2-ONLY-NEXT: vpor %xmm11, %xmm14, %xmm11 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-ONLY-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm12 +; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm15 +; AVX2-ONLY-NEXT: vpor %xmm12, %xmm15, %xmm12 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm6, %ymm12, %ymm12 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 -; AVX2-ONLY-NEXT: vpor %xmm10, %xmm14, %xmm10 -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15] -; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15] -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm13, %xmm13 +; AVX2-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15] +; AVX2-ONLY-NEXT: # ymm15 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpshufb %ymm15, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15] +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm13, %ymm13 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm6, %ymm10, %ymm6 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm15, %xmm14 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm4 -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm15, %ymm13, %ymm13 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm15 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm7 +; AVX2-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13] -; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128] -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13] +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128] +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7],ymm4[8,9,10,11,12],ymm1[13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm3 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm2 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm3 ; AVX2-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -3815,12 +3808,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%rdx) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-ONLY-NEXT: addq $136, %rsp ; AVX2-ONLY-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index 70548501cfe76..6d38dcbbcf074 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -97,73 +97,73 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride6_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,3] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: andps %xmm2, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm3[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: andps %xmm2, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm5[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: por %xmm1, %xmm8 ; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm8[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm8[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movd %xmm1, (%rsi) -; SSE-NEXT: movd %xmm5, (%rdx) -; SSE-NEXT: movd %xmm6, (%rcx) -; SSE-NEXT: movd %xmm7, (%r8) -; SSE-NEXT: movd %xmm0, (%r9) +; SSE-NEXT: movd %xmm0, (%rsi) +; SSE-NEXT: movd %xmm4, (%rdx) +; SSE-NEXT: movd %xmm7, (%rcx) +; SSE-NEXT: movd %xmm6, (%r8) +; SSE-NEXT: movd %xmm1, (%r9) ; SSE-NEXT: movd %xmm2, (%rax) ; SSE-NEXT: retq ; @@ -470,81 +470,78 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i8_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa 32(%rdi), %xmm10 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm8 ; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm10, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm6 ; SSE-NEXT: por %xmm7, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pand %xmm3, %xmm11 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: pand %xmm4, %xmm11 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: pand %xmm3, %xmm14 -; SSE-NEXT: movdqa 80(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: pand %xmm3, %xmm10 -; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm14 +; SSE-NEXT: movdqa 80(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm13 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pand %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: por %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm9 ; SSE-NEXT: pand %xmm7, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9 -; SSE-NEXT: por %xmm4, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm9 -; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pxor %xmm9, %xmm9 ; SSE-NEXT: movdqa %xmm6, %xmm0 @@ -571,10 +568,10 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] @@ -587,10 +584,10 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm0, %xmm5 ; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm6, %xmm11 +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: por %xmm6, %xmm12 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm15, %xmm0 @@ -602,7 +599,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: por %xmm2, %xmm14 +; SSE-NEXT: por %xmm3, %xmm14 ; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] @@ -611,13 +608,14 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: por %xmm12, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm13 ; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] @@ -626,25 +624,25 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] ; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] @@ -657,114 +655,113 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm6, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm13, %xmm11 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] ; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: pand %xmm12, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm1[2,3] -; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: por %xmm11, %xmm13 +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[2,3] +; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,0,0] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,7,4] -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movdqa %xmm11, (%rdx) -; SSE-NEXT: movdqa %xmm8, (%rcx) -; SSE-NEXT: movdqa %xmm4, (%r8) -; SSE-NEXT: movdqa %xmm7, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,65535,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4] +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: packuswb %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movdqa %xmm12, (%rdx) +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm10, (%r8) +; SSE-NEXT: movdqa %xmm6, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm0, (%rax) ; SSE-NEXT: retq @@ -1080,450 +1077,457 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride6_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $280, %rsp # imm = 0x118 -; SSE-NEXT: movdqa 64(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm6 -; SSE-NEXT: movdqa (%rdi), %xmm15 -; SSE-NEXT: movdqa 16(%rdi), %xmm13 +; SSE-NEXT: movdqa 64(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm9 +; SSE-NEXT: movdqa (%rdi), %xmm14 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: pand %xmm5, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: pand %xmm7, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pand %xmm10, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm11 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 144(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: movdqa 144(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: movdqa 160(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm13, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa 160(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm9, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: movdqa 176(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm12 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: movdqa 96(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm10 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; SSE-NEXT: por %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pand %xmm11, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: pxor %xmm9, %xmm9 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3],xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: packuswb %xmm15, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 -; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm13, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; SSE-NEXT: psrld $16, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm5[2],xmm14[3],xmm5[3] -; SSE-NEXT: packuswb %xmm14, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm13, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm2[8],xmm8[9],xmm2[9],xmm8[10],xmm2[10],xmm8[11],xmm2[11],xmm8[12],xmm2[12],xmm8[13],xmm2[13],xmm8[14],xmm2[14],xmm8[15],xmm2[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm12, %xmm1 -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm7 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: packuswb %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: packuswb %xmm14, %xmm14 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm14 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: packuswb %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: packuswb %xmm15, %xmm15 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: pand %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: packuswb %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm14 -; SSE-NEXT: por %xmm14, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm15 +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm12 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,5,6] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm10 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa (%rsp), %xmm15 # 16-byte Reload +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm4, %xmm13 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] +; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[3,0] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,7,5,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3],xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: pandn %xmm5, %xmm15 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: por %xmm6, %xmm15 +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[3,0] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm13[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm13[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm3, %xmm14 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pandn %xmm3, %xmm13 -; SSE-NEXT: por %xmm4, %xmm13 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pand %xmm9, %xmm13 -; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm13, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm10, %xmm0 @@ -1531,191 +1535,196 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm13, %xmm15 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm12 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: packuswb %xmm0, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: packuswb %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[2,1,0,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm13, %xmm3 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: por %xmm4, %xmm13 -; SSE-NEXT: pand %xmm13, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: por %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm5[2,3] -; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm6[2,3] +; SSE-NEXT: psrlq $48, %xmm6 ; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm7, %xmm5 -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: pandn %xmm9, %xmm11 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm8[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm5, %xmm10 -; SSE-NEXT: por %xmm9, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 ; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: pand %xmm5, %xmm10 ; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm9, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm8, %xmm11 -; SSE-NEXT: packuswb %xmm11, %xmm9 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,5,5,7,4] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,0,65535,0,0] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: por %xmm7, %xmm12 +; SSE-NEXT: packuswb %xmm12, %xmm11 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm11, %xmm7 ; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: por %xmm10, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm2[8],xmm9[9],xmm2[9],xmm9[10],xmm2[10],xmm9[11],xmm2[11],xmm9[12],xmm2[12],xmm9[13],xmm2[13],xmm9[14],xmm2[14],xmm9[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm9[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm9[2,3] -; SSE-NEXT: psrlq $48, %xmm9 -; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm10, %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: por %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm14, %xmm10 ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm10[2,3] +; SSE-NEXT: psrlq $48, %xmm10 +; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm11, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm1[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm6 +; SSE-NEXT: por %xmm12, %xmm6 +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 ; SSE-NEXT: pandn %xmm10, %xmm5 -; SSE-NEXT: por %xmm11, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,7,4] -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: packuswb %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4] +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: packuswb %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movdqa %xmm14, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r8) -; SSE-NEXT: movdqa %xmm12, (%r8) -; SSE-NEXT: movdqa %xmm4, 16(%r9) -; SSE-NEXT: movdqa %xmm6, (%r9) +; SSE-NEXT: movdqa %xmm15, (%r8) +; SSE-NEXT: movdqa %xmm3, 16(%r9) +; SSE-NEXT: movdqa %xmm8, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm8, (%rax) +; SSE-NEXT: movdqa %xmm7, (%rax) ; SSE-NEXT: addq $280, %rsp # imm = 0x118 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride6_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $168, %rsp -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[4,10] -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 @@ -1724,15 +1733,16 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[5,11] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm2 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 @@ -1743,17 +1753,17 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm3, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm10 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm15, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm12 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm10[0] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm8[5,11,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[3,9,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm10, %xmm12, %xmm0 -; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm10, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -1776,8 +1786,8 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm8[0,6,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 @@ -1785,89 +1795,89 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm8[1,7,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0] ; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0] ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3,4,5],xmm6[6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm5, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm10, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm1, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,128,3,9,15,0,0,128,128,128,3,9,15,0,0] ; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm13 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm12 ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm11, %xmm11 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm12[3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm12[3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm3[4,10,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm5[4,10,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12] ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm10, %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm10, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm3[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm5[5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 @@ -1877,71 +1887,71 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm12, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm11 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm14 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm5[0,6,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm11 ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm11, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm11, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm15[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm11[1],xmm9[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[1,7,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm14, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: addq $168, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -1949,30 +1959,30 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-LABEL: load_i8_stride6_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm9 +; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm9 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u] ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm11 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[0,1],ymm3[0,1] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm3 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm11, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,9,15],zero,zero,xmm10[1,7,13,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm9, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm9, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm9 +; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm9 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero,xmm9[u,u,u,u,u] @@ -2007,12 +2017,12 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero ; AVX2-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = <0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u> ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm6, %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[4,10],zero,zero,zero,xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm6, %xmm12, %xmm6 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm12[5,6,7] @@ -2022,34 +2032,34 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero ; AVX2-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm2 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,11],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm3 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm1, %ymm4, %ymm1 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u],zero,zero,xmm2[0,6,12],zero,zero,zero,xmm2[4,10] +; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm2, %ymm4, %ymm2 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,4,10],zero,zero,zero,xmm9[2,8,14],zero,zero ; AVX2-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,xmm2[1,7,13],zero,zero,zero,xmm2[5,11] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,5,11],zero,zero,zero,xmm9[3,9,15],zero,zero -; AVX2-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-ONLY-NEXT: vmovdqa %ymm7, (%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm6, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -2289,248 +2299,253 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i8_stride6_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $824, %rsp # imm = 0x338 -; SSE-NEXT: movdqa 64(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm4 +; SSE-NEXT: subq $808, %rsp # imm = 0x328 +; SSE-NEXT: movdqa 64(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 80(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 336(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: movdqa 336(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 288(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa 288(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa 368(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa 352(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: movdqa 368(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 352(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa 240(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm12, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm12, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa 208(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa 192(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: movdqa 208(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa 192(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa 272(%rdi), %xmm14 +; SSE-NEXT: movdqa 272(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm14, %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm13 -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm15, %xmm2 +; SSE-NEXT: movdqa 256(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm6, %xmm12 +; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa 144(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm7, %xmm4 +; SSE-NEXT: movdqa 144(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm10, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm10, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pandn %xmm0, %xmm11 ; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm7, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: movdqa 112(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 ; SSE-NEXT: movdqa 160(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2542,7 +2557,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pandn %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2555,21 +2570,22 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm15, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm6, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm15 ; SSE-NEXT: movdqa 96(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -2584,52 +2600,54 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm7, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm9, %xmm5 ; SSE-NEXT: por %xmm11, %xmm5 -; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: por %xmm0, %xmm8 ; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: pand %xmm7, %xmm5 ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pxor %xmm5, %xmm5 @@ -2637,196 +2655,195 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: packuswb %xmm14, %xmm6 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: packuswb %xmm15, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm15, %xmm4 +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,4] +; SSE-NEXT: pand %xmm2, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: packuswb %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm4, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; SSE-NEXT: packuswb %xmm13, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,4] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: pandn %xmm4, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm13, %xmm4 +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm13, %xmm1 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE-NEXT: packuswb %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm9, %xmm4 +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: packuswb %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: packuswb %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: packuswb %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: por %xmm2, %xmm14 -; SSE-NEXT: packuswb %xmm14, %xmm14 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,4] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm14 -; SSE-NEXT: por %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,4] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] @@ -2837,42 +2854,43 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm4 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] @@ -2880,42 +2898,42 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pand %xmm10, %xmm8 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] @@ -2923,42 +2941,43 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] @@ -2966,587 +2985,587 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm10, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,7,5,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm12[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm8[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm8[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm7[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm5[8],xmm11[9],xmm5[9],xmm11[10],xmm5[10],xmm11[11],xmm5[11],xmm11[12],xmm5[12],xmm11[13],xmm5[13],xmm11[14],xmm5[14],xmm11[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm1[3,0] -; SSE-NEXT: movaps %xmm1, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm8 -; SSE-NEXT: packuswb %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: packuswb %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm11 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm15 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm14, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm7, %xmm6 -; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 ; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,2] -; SSE-NEXT: packuswb %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm14, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand %xmm14, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm8, %xmm7 -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: por %xmm9, %xmm7 -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm8[2,3] -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm10[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm9, %xmm8 -; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,7,4] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm10 ; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm12 -; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: packuswb %xmm12, %xmm12 -; SSE-NEXT: pand %xmm14, %xmm12 -; SSE-NEXT: por %xmm10, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: packuswb %xmm10, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm9 +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm6[2,3] +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm8, %xmm6 ; SSE-NEXT: movdqa %xmm13, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,7,5,6,7] +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,0,0] -; SSE-NEXT: movdqa %xmm11, %xmm13 -; SSE-NEXT: pandn %xmm10, %xmm13 +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: pandn %xmm10, %xmm12 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] ; SSE-NEXT: pand %xmm11, %xmm8 -; SSE-NEXT: por %xmm8, %xmm13 -; SSE-NEXT: packuswb %xmm13, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm10, %xmm8 -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: por %xmm12, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm10[2,3] -; SSE-NEXT: psrlq $48, %xmm10 -; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm12, %xmm10 -; SSE-NEXT: movdqa %xmm14, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm12 -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: pandn %xmm10, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm10 -; SSE-NEXT: por %xmm13, %xmm10 +; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: packuswb %xmm12, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: pandn %xmm8, %xmm12 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm6, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm6[2,3] +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm9, %xmm10 ; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: pand %xmm14, %xmm10 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: por %xmm12, %xmm10 -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: pandn %xmm13, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: por %xmm12, %xmm14 -; SSE-NEXT: packuswb %xmm14, %xmm13 -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: pandn %xmm13, %xmm12 -; SSE-NEXT: pand %xmm2, %xmm10 -; SSE-NEXT: por %xmm10, %xmm12 -; SSE-NEXT: movdqa (%rsp), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] +; SSE-NEXT: pand %xmm13, %xmm10 +; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,7,4] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: packuswb %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,3] +; SSE-NEXT: psrlq $48, %xmm8 +; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm9, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] ; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm10[2,3] -; SSE-NEXT: psrlq $48, %xmm10 -; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm14[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm13, %xmm10 -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: pandn %xmm10, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm9, %xmm14 -; SSE-NEXT: pandn %xmm10, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm14[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm10 -; SSE-NEXT: por %xmm14, %xmm10 -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: pand %xmm4, %xmm10 -; SSE-NEXT: por %xmm13, %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm13 -; SSE-NEXT: por %xmm13, %xmm15 -; SSE-NEXT: packuswb %xmm15, %xmm14 -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pandn %xmm14, %xmm13 -; SSE-NEXT: pand %xmm2, %xmm10 -; SSE-NEXT: por %xmm10, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm10[2,3] -; SSE-NEXT: psrlq $48, %xmm10 -; SSE-NEXT: psrldq {{.*#+}} xmm14 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm14, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm13 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: pandn %xmm14, %xmm9 -; SSE-NEXT: por %xmm15, %xmm9 -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand %xmm4, %xmm9 -; SSE-NEXT: pandn %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,7,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4] ; SSE-NEXT: pandn %xmm5, %xmm11 -; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: packuswb %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: por %xmm9, %xmm2 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3580,66 +3599,67 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movdqa %xmm7, 16(%r9) -; SSE-NEXT: movdqa %xmm6, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm2, 16(%rax) -; SSE-NEXT: movdqa %xmm13, 32(%rax) +; SSE-NEXT: movdqa %xmm4, 16(%rax) +; SSE-NEXT: movdqa %xmm6, 32(%rax) ; SSE-NEXT: movdqa %xmm12, 48(%rax) -; SSE-NEXT: movdqa %xmm8, (%rax) -; SSE-NEXT: addq $824, %rsp # imm = 0x338 +; SSE-NEXT: movdqa %xmm2, (%rax) +; SSE-NEXT: addq $808, %rsp # imm = 0x328 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride6_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $808, %rsp # imm = 0x328 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,10,0,0,4,10,0,0,4,10,0,0,4,10] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0] ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 @@ -3670,529 +3690,547 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0] ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm3, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm3, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm11 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm6, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm5[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm0, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm2, %xmm5, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm13, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13] ; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm10, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm5 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm10 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,128,128,1,7,13,0,0,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm3 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm15, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm15, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm3 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm1, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] -; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm0[3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm13, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm15[0,6,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [128,128,128,3,9,15,0,0,128,128,128,3,9,15,0,0] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,11] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,0,128,128,128,5,11,0,0,0,128,128,128,5,11] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,0,3,9,15,128,128,0,0,0,3,9,15,128,128] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,0,5,11,0,0,5,11,0,0,5,11,0,0,5,11] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm1, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3,4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm3, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm12, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm13, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[5,11] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm11[3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm10 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] +; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm11, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[2,8,14],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0] ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm14, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm10, %ymm4 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm14, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm11, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm1[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm6 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm14, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm14, %ymm6 +; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0] ; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm14[1],xmm4[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm1[0,6,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm7[0,6,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm14[1],xmm4[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm13[1],xmm12[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3,4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[5,11],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm1[1,7,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm6[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm7[1,7,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4214,259 +4252,256 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) ; AVX1-ONLY-NEXT: addq $808, %rsp # imm = 0x328 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i8_stride6_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $360, %rsp # imm = 0x168 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-ONLY-NEXT: subq $328, %rsp # imm = 0x148 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[0,1],ymm1[0,1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm1[0,1] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm12, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm5, %ymm7, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm2 +; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm11 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm13 -; AVX2-ONLY-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm0 +; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm9 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12 +; AVX2-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm9 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm9, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm15, %ymm9, %ymm14 -; AVX2-ONLY-NEXT: vmovdqa %ymm15, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm14, %xmm10 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm14 +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm14, %xmm0 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX2-ONLY-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm15, %xmm10 +; AVX2-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm0[0,1],ymm10[0,1] -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm0[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm11, %ymm10, %ymm6 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm0[0,1],ymm11[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm9, %ymm11, %ymm13 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm1, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm1 ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm13, %ymm1 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm15, %ymm7, %ymm5, %ymm0 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm3 -; AVX2-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm10, %ymm5, %ymm1 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm5 +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0> -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm11, %ymm12, %ymm2 -; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm2, %ymm6 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm3, %ymm6, %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm15, %ymm9, %ymm8, %ymm3 -; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm12 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm7 -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX2-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm4 -; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm4, %ymm7 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm5, %ymm7, %ymm14 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0> +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm3, %ymm15 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm5, %ymm15, %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm8, %ymm7, %ymm5 +; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm7 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm15 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm11, %ymm9, %ymm0 +; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm3 +; AVX2-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm8 +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm15 ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm5, %ymm4, %ymm1 -; AVX2-ONLY-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm15 -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm6, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm12 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm6, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u> ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm12, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u> -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm11, %ymm2, %ymm13 -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm11, %ymm9, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm4 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm3, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm10 -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm10, %xmm4 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-ONLY-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm4 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm6 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm11 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm9 +; AVX2-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-ONLY-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm14, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm2 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm11 +; AVX2-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm1 ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm11 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm8, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm6 -; AVX2-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm14, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm1 +; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm14 -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm14, %xmm1 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm14 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] -; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm13, %ymm0 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm15, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm1 -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm15, %xmm9 -; AVX2-ONLY-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm8 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm8, %xmm4 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm3, %ymm8 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm8[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm11 +; AVX2-ONLY-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm11 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] +; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm13, %ymm0 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm0[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm12 -; AVX2-ONLY-NEXT: vpor %xmm2, %xmm12, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm6 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm5 -; AVX2-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] -; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm13, %ymm11 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm11[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm5, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm15, %xmm4 -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm4 -; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm5 -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm3, %ymm5 +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm12, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX2-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm10, %xmm2 +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm4 +; AVX2-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm6, %ymm5 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm4 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm11 +; AVX2-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] +; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm13, %ymm13 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm13[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm1 +; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm2 +; AVX2-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm6, %ymm2 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm4 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm6 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm2 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm8 ; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm10 -; AVX2-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm7 ; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm11 -; AVX2-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7],ymm7[8,9,10],ymm1[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm10, %xmm4 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7],ymm10[8,9,10],ymm4[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm7 ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm8 ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 @@ -4474,288 +4509,280 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rsi) ; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rdx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rax) -; AVX2-ONLY-NEXT: addq $360, %rsp # imm = 0x168 +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-ONLY-NEXT: addq $328, %rsp # imm = 0x148 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i8_stride6_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $88, %rsp -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = <0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vmovdqa64 224(%rdi), %ymm29 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm31 -; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm10 -; AVX512F-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm10 -; AVX512F-NEXT: vpshufb %xmm0, %xmm10, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u> -; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512F-NEXT: vpshufb %xmm7, %xmm11, %xmm3 -; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512F-NEXT: vmovdqa64 32(%rdi), %ymm30 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm27 -; AVX512F-NEXT: vmovdqa64 160(%rdi), %ymm20 -; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm12 -; AVX512F-NEXT: vpternlogq $202, %ymm27, %ymm20, %ymm12 -; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-NEXT: subq $40, %rsp +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-NEXT: vmovdqa64 224(%rdi), %ymm25 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm26 +; AVX512F-NEXT: vmovdqa %ymm11, %ymm0 +; AVX512F-NEXT: vpternlogq $202, %ymm25, %ymm26, %ymm0 +; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u> +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512F-NEXT: vpshufb %xmm5, %xmm4, %xmm6 +; AVX512F-NEXT: vpor %xmm3, %xmm6, %xmm9 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm30 +; AVX512F-NEXT: vmovdqa64 32(%rdi), %ymm31 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm29 +; AVX512F-NEXT: vmovdqa64 160(%rdi), %ymm18 +; AVX512F-NEXT: vmovdqa %ymm11, %ymm6 +; AVX512F-NEXT: vpternlogq $202, %ymm29, %ymm18, %ymm6 +; AVX512F-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-NEXT: vpshufb %xmm3, %xmm13, %xmm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-NEXT: vpshufb %xmm4, %xmm12, %xmm9 -; AVX512F-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm14 -; AVX512F-NEXT: vpternlogq $202, %ymm30, %ymm28, %ymm14 -; AVX512F-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX512F-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-NEXT: vpshufb %xmm7, %xmm15, %xmm1 -; AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm5 +; AVX512F-NEXT: vpshufb %xmm3, %xmm7, %xmm12 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-NEXT: vpshufb %xmm8, %xmm6, %xmm13 +; AVX512F-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-NEXT: vinserti32x4 $2, %xmm9, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa %ymm11, %ymm9 +; AVX512F-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm9 +; AVX512F-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX512F-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512F-NEXT: vpshufb %xmm5, %xmm13, %xmm5 +; AVX512F-NEXT: vporq %xmm1, %xmm5, %xmm17 ; AVX512F-NEXT: vmovdqa64 320(%rdi), %ymm24 -; AVX512F-NEXT: vmovdqa64 352(%rdi), %ymm23 -; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm23, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX512F-NEXT: vpshufb %xmm4, %xmm0, %xmm4 -; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512F-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm17 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX512F-NEXT: vpor %xmm4, %xmm11, %xmm2 -; AVX512F-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-NEXT: vpshufb %xmm4, %xmm13, %xmm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm13 = -; AVX512F-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX512F-NEXT: vpor %xmm11, %xmm12, %xmm2 -; AVX512F-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX512F-NEXT: vpshufb %xmm10, %xmm15, %xmm10 -; AVX512F-NEXT: vpor %xmm3, %xmm10, %xmm2 -; AVX512F-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX512F-NEXT: vporq %xmm1, %xmm0, %xmm26 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm13, %ymm10 -; AVX512F-NEXT: vpternlogq $202, %ymm31, %ymm29, %ymm10 -; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm15 -; AVX512F-NEXT: vpshufb %xmm1, %xmm15, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX512F-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-NEXT: vpternlogq $202, %ymm20, %ymm27, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX512F-NEXT: vpshufb %xmm8, %xmm0, %xmm11 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512F-NEXT: vpshufb %xmm7, %xmm4, %xmm12 -; AVX512F-NEXT: vpor %xmm11, %xmm12, %xmm2 -; AVX512F-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa %ymm13, %ymm11 -; AVX512F-NEXT: vpternlogq $202, %ymm28, %ymm30, %ymm11 -; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512F-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX512F-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm12 -; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm3 -; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm3 -; AVX512F-NEXT: vpshufb %xmm8, %xmm3, %xmm8 -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm9 -; AVX512F-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX512F-NEXT: vpor %xmm7, %xmm8, %xmm1 -; AVX512F-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm1, %xmm15, %xmm8 -; AVX512F-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm1, %xmm10, %xmm10 -; AVX512F-NEXT: vmovdqa64 %xmm1, %xmm18 -; AVX512F-NEXT: vpor %xmm8, %xmm10, %xmm1 -; AVX512F-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-NEXT: vmovdqa64 352(%rdi), %ymm22 +; AVX512F-NEXT: vmovdqa %ymm11, %ymm1 +; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm22, %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512F-NEXT: vpshufb %xmm8, %xmm1, %xmm8 +; AVX512F-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = <1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u> ; AVX512F-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512F-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX512F-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} ymm21 = ymm0[2,3],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $1, 288(%rdi), %ymm0, %ymm22 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-NEXT: vmovdqa %ymm1, %ymm15 -; AVX512F-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm15 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm10 = ymm15[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm17 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} ymm10 = ymm17[2,3],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $1, 96(%rdi), %ymm17, %ymm25 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm17 -; AVX512F-NEXT: vmovdqa %ymm1, %ymm4 -; AVX512F-NEXT: vpternlogq $202, %ymm10, %ymm25, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpternlogq $248, %ymm2, %ymm5, %ymm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm17 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm15 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7],ymm0[8,9,10],ymm15[11,12,13,14,15] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload -; AVX512F-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512F-NEXT: vpternlogq $226, %zmm2, %zmm6, %zmm15 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26 -; AVX512F-NEXT: vpternlogq $184, %zmm15, %zmm5, %zmm26 -; AVX512F-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-NEXT: vpshufb %xmm0, %xmm7, %xmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512F-NEXT: vporq %xmm4, %xmm6, %xmm28 +; AVX512F-NEXT: vpshufb %xmm8, %xmm9, %xmm4 +; AVX512F-NEXT: vpshufb %xmm12, %xmm13, %xmm6 +; AVX512F-NEXT: vporq %xmm4, %xmm6, %xmm21 +; AVX512F-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX512F-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512F-NEXT: vporq %xmm0, %xmm1, %xmm27 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-NEXT: vmovdqa %ymm9, %ymm4 +; AVX512F-NEXT: vpternlogq $202, %ymm26, %ymm25, %ymm4 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm15 +; AVX512F-NEXT: vpshufb %xmm0, %xmm15, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm5 +; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX512F-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vmovdqa %ymm11, %ymm5 +; AVX512F-NEXT: vpternlogq $202, %ymm18, %ymm29, %ymm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512F-NEXT: vpshufb %xmm7, %xmm5, %xmm8 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512F-NEXT: vpshufb %xmm12, %xmm1, %xmm13 +; AVX512F-NEXT: vpor %xmm8, %xmm13, %xmm2 +; AVX512F-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vmovdqa %ymm9, %ymm13 +; AVX512F-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm13 +; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512F-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX512F-NEXT: vmovdqa64 %xmm18, %xmm2 -; AVX512F-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm14 -; AVX512F-NEXT: vpshufb %xmm8, %xmm3, %xmm0 -; AVX512F-NEXT: vpshufb %xmm7, %xmm9, %xmm2 -; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = <128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %ymm29, %ymm4 -; AVX512F-NEXT: vpternlogq $226, %ymm31, %ymm16, %ymm4 -; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm11 -; AVX512F-NEXT: vpshufb %xmm8, %xmm11, %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[4,10],zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[u,u,u,u,u,u] -; AVX512F-NEXT: vporq %xmm0, %xmm2, %xmm29 -; AVX512F-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512F-NEXT: vpternlogq $226, %ymm27, %ymm13, %ymm2 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512F-NEXT: vpshufb %xmm5, %xmm0, %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX512F-NEXT: vpshufb %xmm6, %xmm2, %xmm7 -; AVX512F-NEXT: vporq %xmm3, %xmm7, %xmm20 -; AVX512F-NEXT: vpternlogq $202, %ymm28, %ymm30, %ymm16 -; AVX512F-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512F-NEXT: vpternlogq $202, %ymm25, %ymm10, %ymm3 -; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm23, %ymm13 -; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm7 -; AVX512F-NEXT: vpshufb %xmm5, %xmm7, %xmm5 ; AVX512F-NEXT: vpshufb %xmm6, %xmm13, %xmm6 -; AVX512F-NEXT: vporq %xmm5, %xmm6, %xmm19 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm5, %xmm11, %xmm6 -; AVX512F-NEXT: vmovdqa64 %xmm5, %xmm23 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm11 = <5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm11, %xmm4, %xmm9 -; AVX512F-NEXT: vporq %xmm6, %xmm9, %xmm27 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX512F-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX512F-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpternlogq $236, %ymm18, %ymm4, %ymm12 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpternlogq $236, %ymm18, %ymm3, %ymm14 -; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm1 -; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm2 -; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm3, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512F-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm6 -; AVX512F-NEXT: vextracti32x4 $1, %ymm16, %xmm3 -; AVX512F-NEXT: vpshufb %xmm8, %xmm3, %xmm4 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[4,10],zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[u,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm4, %xmm8, %xmm4 -; AVX512F-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX512F-NEXT: vpshufb %xmm15, %xmm13, %xmm8 -; AVX512F-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vpternlogq $226, %ymm25, %ymm8, %ymm10 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm9 = ymm10[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload -; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm9, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm5 -; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm8 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm9 = ymm8[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpternlogq $242, %ymm5, %ymm13, %ymm9 -; AVX512F-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm5 -; AVX512F-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm9 -; AVX512F-NEXT: vinserti32x4 $2, %xmm29, %zmm9, %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpternlogq $226, %zmm9, %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufb %xmm11, %xmm6, %xmm4 -; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm4 = ymm10[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm4 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm7 = ymm8[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpternlogq $242, %ymm4, %ymm13, %ymm7 -; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm4 +; AVX512F-NEXT: vporq %xmm0, %xmm6, %xmm16 +; AVX512F-NEXT: vmovdqa %ymm11, %ymm10 +; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm24, %ymm10 +; AVX512F-NEXT: vpshufb %xmm7, %xmm10, %xmm8 +; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm7 +; AVX512F-NEXT: vpshufb %xmm12, %xmm7, %xmm12 +; AVX512F-NEXT: vpor %xmm8, %xmm12, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = <128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm12, %xmm15, %xmm15 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512F-NEXT: vpor %xmm4, %xmm15, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13] +; AVX512F-NEXT: vpor %xmm1, %xmm15, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm1 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} ymm19 = ymm1[2,3],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-NEXT: vmovdqa %ymm5, %ymm1 +; AVX512F-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} ymm6 = ymm23[2,3],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 +; AVX512F-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm23, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpternlogq $248, %ymm4, %ymm17, %ymm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm15 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpternlogq $248, %ymm4, %ymm21, %ymm1 +; AVX512F-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 +; AVX512F-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload +; AVX512F-NEXT: vpternlogq $226, %zmm2, %zmm17, %zmm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 +; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm17 +; AVX512F-NEXT: vpshufb %xmm12, %xmm14, %xmm0 +; AVX512F-NEXT: vpshufb %xmm8, %xmm13, %xmm1 +; AVX512F-NEXT: vporq %xmm0, %xmm1, %xmm21 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[u,u,u,u,u,1,7,13],zero,zero,zero,xmm10[5,11],zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] +; AVX512F-NEXT: vporq %xmm0, %xmm1, %xmm28 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 %ymm25, %ymm7 +; AVX512F-NEXT: vpternlogq $226, %ymm26, %ymm11, %ymm7 +; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm1 +; AVX512F-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm14 = <4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm14, %xmm7, %xmm2 +; AVX512F-NEXT: vporq %xmm1, %xmm2, %xmm26 +; AVX512F-NEXT: vmovdqa64 %ymm18, %ymm10 +; AVX512F-NEXT: vpternlogq $226, %ymm29, %ymm9, %ymm10 +; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm8 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-NEXT: vpshufb %xmm1, %xmm8, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-NEXT: vpshufb %xmm3, %xmm10, %xmm4 +; AVX512F-NEXT: vporq %xmm2, %xmm4, %xmm27 +; AVX512F-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm11 +; AVX512F-NEXT: vmovdqa %ymm5, %ymm4 +; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm6, %ymm4 +; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm22, %ymm9 +; AVX512F-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX512F-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX512F-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX512F-NEXT: vporq %xmm1, %xmm2, %xmm24 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa64 %xmm1, %xmm29 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm13 = <5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm13, %xmm7, %xmm1 +; AVX512F-NEXT: vporq %xmm0, %xmm1, %xmm18 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-NEXT: vpshufb %xmm2, %xmm8, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-NEXT: vpshufb %xmm0, %xmm10, %xmm8 +; AVX512F-NEXT: vpor %xmm1, %xmm8, %xmm8 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm22 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpternlogq $236, %ymm22, %ymm3, %ymm16 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpternlogq $236, %ymm22, %ymm4, %ymm21 +; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX512F-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm5 +; AVX512F-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512F-NEXT: vpternlogq $248, %ymm22, %ymm4, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 +; AVX512F-NEXT: vpshufb %ymm3, %ymm5, %ymm3 +; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm1 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm5 +; AVX512F-NEXT: vpshufb %xmm5, %xmm1, %xmm5 +; AVX512F-NEXT: vpshufb %xmm14, %xmm11, %xmm14 +; AVX512F-NEXT: vpor %xmm5, %xmm14, %xmm5 +; AVX512F-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512F-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-NEXT: vpternlogq $226, %ymm23, %ymm2, %ymm6 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 +; AVX512F-NEXT: vpternlogq $248, %ymm22, %ymm9, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512F-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm9 +; AVX512F-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm2 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpternlogq $242, %ymm9, %ymm12, %ymm10 +; AVX512F-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm9 +; AVX512F-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm10 +; AVX512F-NEXT: vinserti32x4 $2, %xmm26, %zmm10, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpternlogq $226, %zmm10, %zmm14, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpternlogq $184, %zmm5, %zmm10, %zmm9 +; AVX512F-NEXT: vmovdqa64 %xmm29, %xmm5 +; AVX512F-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm13, %xmm11, %xmm5 +; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti32x4 $2, %xmm27, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm0, %zmm15, %zmm3 -; AVX512F-NEXT: vpternlogq $184, %zmm3, %zmm9, %zmm4 -; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX512F-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm12 -; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX512F-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm14 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpternlogq $184, %zmm12, %zmm0, %zmm2 -; AVX512F-NEXT: vpternlogq $184, %zmm14, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm26, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm2, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpternlogq $242, %ymm0, %ymm12, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm2 +; AVX512F-NEXT: vinserti32x4 $2, %xmm18, %zmm2, %zmm2 +; AVX512F-NEXT: vpternlogq $226, %zmm2, %zmm14, %zmm1 +; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm10, %zmm0 +; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX512F-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm16 +; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX512F-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload +; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpternlogq $184, %zmm16, %zmm1, %zmm4 +; AVX512F-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm15, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm3, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm9, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512F-NEXT: addq $88, %rsp +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-NEXT: addq $40, %rsp ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index 0cd7ba03c66cd..44459e1756009 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -20,50 +20,50 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movd %xmm3, %edi +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movd %xmm1, %edi ; SSE-NEXT: movw %di, (%rsi) ; SSE-NEXT: movd %xmm4, %esi ; SSE-NEXT: movw %si, (%rdx) -; SSE-NEXT: movd %xmm6, %edx +; SSE-NEXT: movd %xmm5, %edx ; SSE-NEXT: movw %dx, (%rcx) -; SSE-NEXT: movd %xmm5, %ecx +; SSE-NEXT: movd %xmm6, %ecx ; SSE-NEXT: movw %cx, (%r8) ; SSE-NEXT: movd %xmm7, %ecx ; SSE-NEXT: movw %cx, (%r9) ; SSE-NEXT: movd %xmm0, %ecx ; SSE-NEXT: movw %cx, (%r10) -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: movw %cx, (%rax) ; SSE-NEXT: retq ; @@ -108,67 +108,67 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i8_stride7_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm4 ; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pand %xmm8, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: pand %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm4, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: por %xmm5, %xmm15 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[1,1,0,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,1,0,3,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: por %xmm14, %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] @@ -176,39 +176,39 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 ; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,0,2,3] ; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,0,2,3] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] -; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] +; SSE-NEXT: movdqa %xmm10, %xmm7 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3],xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,1,2,1] +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movd %xmm2, (%rsi) -; SSE-NEXT: movd %xmm3, (%rdx) -; SSE-NEXT: movd %xmm6, (%rcx) -; SSE-NEXT: movd %xmm4, (%r8) -; SSE-NEXT: movd %xmm5, (%r9) +; SSE-NEXT: movd %xmm5, (%rdx) +; SSE-NEXT: movd %xmm3, (%rcx) +; SSE-NEXT: movd %xmm6, (%r8) +; SSE-NEXT: movd %xmm4, (%r9) ; SSE-NEXT: movd %xmm8, (%rdi) ; SSE-NEXT: movd %xmm0, (%rax) ; SSE-NEXT: retq @@ -389,102 +389,108 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i8_stride7_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa 32(%rdi), %xmm6 ; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pxor %xmm13, %xmm13 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,3,2,1,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: movss {{.*#+}} xmm9 = xmm0[0],xmm9[1,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm5, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm0[0],xmm8[1,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm0, %xmm12 ; SSE-NEXT: movaps %xmm0, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm5[2,3] -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm6[2,3] +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,4,5,6] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: pandn %xmm9, %xmm13 +; SSE-NEXT: por %xmm7, %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: por %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: por %xmm1, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: por %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] ; SSE-NEXT: pand %xmm0, %xmm10 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm10, %xmm0 @@ -495,15 +501,15 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: pand %xmm1, %xmm8 ; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: por %xmm8, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 @@ -511,108 +517,114 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,2,1,0,4,5,6,7] +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,5,6,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm11 ; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm11 ; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: por %xmm14, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm10, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm13[8],xmm7[9],xmm13[9],xmm7[10],xmm13[10],xmm7[11],xmm13[11],xmm7[12],xmm13[12],xmm7[13],xmm13[13],xmm7[14],xmm13[14],xmm7[15],xmm13[15] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm15, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: movq %xmm5, (%rsi) -; SSE-NEXT: movq %xmm8, (%rdx) -; SSE-NEXT: movq %xmm9, (%rcx) -; SSE-NEXT: movq %xmm11, (%r8) -; SSE-NEXT: movq %xmm0, (%r9) -; SSE-NEXT: movq %xmm1, (%rdi) -; SSE-NEXT: movq %xmm2, (%rax) -; SSE-NEXT: retq -; -; AVX1-ONLY-LABEL: load_i8_stride7_vf8: -; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[5,12,u,u,u,u,u,u,u,u,u,u,u] +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,3,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3],xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: movq %xmm13, (%rsi) +; SSE-NEXT: movq %xmm9, (%rdx) +; SSE-NEXT: movq %xmm8, (%rcx) +; SSE-NEXT: movq %xmm6, (%r8) +; SSE-NEXT: movq %xmm10, (%r9) +; SSE-NEXT: movq %xmm11, (%rdi) +; SSE-NEXT: movq %xmm0, (%rax) +; SSE-NEXT: retq +; +; AVX1-ONLY-LABEL: load_i8_stride7_vf8: +; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2] @@ -858,522 +870,534 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i8_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $184, %rsp -; SSE-NEXT: movdqa 96(%rdi), %xmm14 -; SSE-NEXT: movdqa 80(%rdi), %xmm6 +; SSE-NEXT: subq $168, %rsp +; SSE-NEXT: movdqa 96(%rdi), %xmm15 +; SSE-NEXT: movdqa 80(%rdi), %xmm4 ; SSE-NEXT: movdqa 64(%rdi), %xmm7 -; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm10 +; SSE-NEXT: movdqa 48(%rdi), %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,0,65535,0,65535] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm13, %xmm13 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm9 ; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3],xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,0,65535,0,65535] +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3],xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm15, %xmm15 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,2,1] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm13, %xmm13 +; SSE-NEXT: pand %xmm4, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm6 ; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pand %xmm14, %xmm7 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[0,2,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pslld $16, %xmm10 -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: pand %xmm14, %xmm15 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[0,2,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm8 +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm12, %xmm14 +; SSE-NEXT: por %xmm8, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pslld $16, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,6,4,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm13, %xmm9 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,6,4,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm12, %xmm10 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3],xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, (%rsp) # 16-byte Spill -; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3],xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; SSE-NEXT: packuswb %xmm14, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,6,7] -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: pand %xmm12, %xmm10 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm8 -; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; SSE-NEXT: packuswb %xmm8, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm13 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] ; SSE-NEXT: pxor %xmm8, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pandn %xmm12, %xmm6 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: por %xmm9, %xmm7 ; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm7, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm11, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: packuswb %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pand %xmm5, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,2,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[3,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm13 +; SSE-NEXT: por %xmm13, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm3, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm4, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: packuswb %xmm2, %xmm10 +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE-NEXT: pxor %xmm10, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: andps %xmm5, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: por %xmm6, %xmm9 -; SSE-NEXT: packuswb %xmm0, %xmm9 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,3,3] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: andps %xmm14, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] -; SSE-NEXT: pand %xmm11, %xmm10 -; SSE-NEXT: pandn %xmm12, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,5,4,7,6] +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] +; SSE-NEXT: pand %xmm13, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm3, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm2[0],xmm5[1,2,3] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm2[0],xmm6[1,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,1,0,3] +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm2, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: andps %xmm14, %xmm5 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: por %xmm10, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] +; SSE-NEXT: andps %xmm5, %xmm6 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: por %xmm9, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,3,3,3] -; SSE-NEXT: packuswb %xmm7, %xmm5 -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,0,65535,65535] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,1,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,1,0,3] +; SSE-NEXT: packuswb %xmm7, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm6[0],xmm4[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,0,65535,65535] +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pandn %xmm7, %xmm6 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: andps %xmm14, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm14 -; SSE-NEXT: por %xmm5, %xmm14 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rdx) -; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm3, (%r8) -; SSE-NEXT: movdqa %xmm0, (%r9) +; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: andps %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movdqa %xmm8, (%rcx) +; SSE-NEXT: movdqa %xmm15, (%r8) +; SSE-NEXT: movdqa %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm2, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm14, (%rax) -; SSE-NEXT: addq $184, %rsp +; SSE-NEXT: movdqa %xmm5, (%rax) +; SSE-NEXT: addq $168, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride7_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] @@ -1384,8 +1408,8 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] @@ -1398,13 +1422,13 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vpxor %xmm12, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm12[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[3,10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[3,10] ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm8, %xmm10, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[u,u,u,u,u,u,u] @@ -1414,11 +1438,11 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm12[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[4,11] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[4,11] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm10, %xmm13, %xmm10 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm6[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[u,u,u,u,u,u,u] @@ -1428,12 +1452,12 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm12[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[5,12] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[5,12] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm11, %xmm13, %xmm11 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm14 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u] @@ -1443,10 +1467,10 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15,u,u] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3,4,5,6],xmm12[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[6,13] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[6,13] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm0, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u] @@ -1459,12 +1483,12 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] ; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[0,7,14] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[0,7,14] ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm14, %xmm8 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm0, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 @@ -1473,13 +1497,13 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm5[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[1,8,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[1,8,15] ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15] ; AVX1-ONLY-NEXT: vpblendw $31, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4],xmm1[5,6,7] @@ -1807,29 +1831,28 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i8_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $632, %rsp # imm = 0x278 -; SSE-NEXT: movdqa 208(%rdi), %xmm9 -; SSE-NEXT: movdqa 192(%rdi), %xmm6 -; SSE-NEXT: movdqa 176(%rdi), %xmm13 +; SSE-NEXT: subq $648, %rsp # imm = 0x288 +; SSE-NEXT: movdqa 208(%rdi), %xmm14 +; SSE-NEXT: movdqa 192(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm6 ; SSE-NEXT: movdqa 112(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm14, %xmm14 +; SSE-NEXT: pxor %xmm10, %xmm10 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -1838,20 +1861,25 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm8 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,65535,0,65535] -; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,65535] +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm15 ; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] @@ -1862,55 +1890,55 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm13, %xmm2 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm13 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] -; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm6 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -1919,19 +1947,22 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] -; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; SSE-NEXT: movdqa %xmm15, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] @@ -1941,125 +1972,130 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: movdqa 96(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrld $16, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm12 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] -; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm7 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm13 -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,0,65535,0,65535] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: por %xmm4, %xmm13 -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,0,65535] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pslld $16, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; SSE-NEXT: packuswb %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pxor %xmm3, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pxor %xmm5, %xmm5 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 @@ -2067,785 +2103,789 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: pand %xmm7, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm14, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: pand %xmm15, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm13, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pand %xmm13, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pand %xmm2, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pand %xmm2, %xmm14 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm10 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: por %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: pslld $16, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm8 -; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm14, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pslld $16, %xmm8 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm10, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,6,4,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: pxor %xmm9, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,6,4,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm15[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,4,6,5] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: pand %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,4,6,5] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm10 -; SSE-NEXT: pand %xmm15, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] -; SSE-NEXT: pand %xmm12, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm13, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; SSE-NEXT: packuswb %xmm13, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm15, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15] -; SSE-NEXT: pand %xmm13, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,3,2,3] +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm10, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: packuswb %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pxor %xmm15, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] -; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; SSE-NEXT: pand %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: por %xmm2, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: por %xmm10, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm13 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3],xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: por %xmm12, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: packuswb %xmm1, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm12, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE-NEXT: movdqa %xmm13, %xmm9 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: packuswb %xmm8, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa %xmm5, %xmm11 ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: packuswb %xmm15, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: por %xmm2, %xmm12 ; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: por %xmm3, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[3,2,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] -; SSE-NEXT: pand %xmm8, %xmm11 -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,2,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm11 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] +; SSE-NEXT: pand %xmm11, %xmm14 +; SSE-NEXT: por %xmm2, %xmm14 +; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: packuswb %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[3,2,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: por %xmm13, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload ; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3],xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: packuswb %xmm3, %xmm1 -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,3,3] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: andps %xmm1, %xmm5 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: packuswb %xmm0, %xmm5 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm12, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm5 ; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3],xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm14 +; SSE-NEXT: andps %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm15 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm14 -; SSE-NEXT: packuswb %xmm1, %xmm14 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,1,3,3] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm5[0],xmm6[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm5 ; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,0,3] +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,2,1] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pandn %xmm5, %xmm11 -; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm11, %xmm4 -; SSE-NEXT: andps %xmm0, %xmm6 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm11, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3],xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] -; SSE-NEXT: pand %xmm11, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm6, %xmm12 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15] +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: andps %xmm6, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,5,4,7,6] +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: movss {{.*#+}} xmm12 = xmm6[0],xmm12[1,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm10, %xmm5 -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm5[0],xmm8[1,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: pandn %xmm7, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15] -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: por %xmm13, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm7, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: pandn %xmm13, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: por %xmm6, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pandn %xmm14, %xmm6 -; SSE-NEXT: andps %xmm10, %xmm12 -; SSE-NEXT: por %xmm12, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm13, %xmm11 -; SSE-NEXT: por %xmm12, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm13, %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm12, %xmm5 +; SSE-NEXT: andps %xmm0, %xmm8 +; SSE-NEXT: por %xmm8, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,2,2,3] ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pandn %xmm13, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[1,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; SSE-NEXT: packuswb %xmm12, %xmm12 -; SSE-NEXT: movss {{.*#+}} xmm11 = xmm12[0],xmm11[1,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: pand %xmm9, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,5,4,7,6] +; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm12, %xmm8 +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,2,2,3] +; SSE-NEXT: pand %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm15, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm0[0],xmm8[1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm9, %xmm6 +; SSE-NEXT: andps %xmm3, %xmm8 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm9, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: por %xmm10, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[3,3,3,3] +; SSE-NEXT: packuswb %xmm8, %xmm10 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15] -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: pandn %xmm12, %xmm5 -; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm5, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm12, %xmm5 -; SSE-NEXT: andps %xmm9, %xmm11 -; SSE-NEXT: por %xmm11, %xmm5 -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: por %xmm8, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: andps %xmm3, %xmm10 +; SSE-NEXT: por %xmm10, %xmm8 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm11, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pandn %xmm10, %xmm12 -; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15] +; SSE-NEXT: pxor %xmm12, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: por %xmm12, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: por %xmm10, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[3,3,3,3] ; SSE-NEXT: packuswb %xmm11, %xmm10 -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movss {{.*#+}} xmm10 = xmm8[0],xmm10[1,2,3] -; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: pandn %xmm8, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm15[8],xmm12[9],xmm15[9],xmm12[10],xmm15[10],xmm12[11],xmm15[11],xmm12[12],xmm15[12],xmm12[13],xmm15[13],xmm12[14],xmm15[14],xmm12[15],xmm15[15] -; SSE-NEXT: pand %xmm13, %xmm12 -; SSE-NEXT: por %xmm14, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,1,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm12 -; SSE-NEXT: pand %xmm7, %xmm12 -; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm12, %xmm8 -; SSE-NEXT: andps %xmm9, %xmm10 -; SSE-NEXT: por %xmm10, %xmm8 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: pxor %xmm14, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[3,3,3,3] -; SSE-NEXT: packuswb %xmm12, %xmm11 -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: movss {{.*#+}} xmm11 = xmm10[0],xmm11[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; SSE-NEXT: movdqa %xmm12, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] -; SSE-NEXT: pand %xmm13, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm13 -; SSE-NEXT: por %xmm12, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[2,1,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm12, %xmm12 -; SSE-NEXT: pandn %xmm12, %xmm7 -; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: andps %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm11, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pandn %xmm9, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: andps %xmm1, %xmm10 +; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: orps %xmm10, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2854,329 +2894,326 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movdqa %xmm4, (%r9) -; SSE-NEXT: movdqa %xmm3, 16(%r9) +; SSE-NEXT: movdqa %xmm14, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm5, (%rax) -; SSE-NEXT: movdqa %xmm6, 16(%rax) +; SSE-NEXT: movdqa %xmm6, (%rax) +; SSE-NEXT: movdqa %xmm5, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm7, (%rax) +; SSE-NEXT: movaps %xmm1, (%rax) ; SSE-NEXT: movdqa %xmm8, 16(%rax) -; SSE-NEXT: addq $632, %rsp # imm = 0x278 +; SSE-NEXT: addq $648, %rsp # imm = 0x288 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride7_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $200, %rsp -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u,u,u],zero,zero,zero,xmm4[6,13,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm5[u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm6[u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u],zero,zero,xmm7[3,10,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,5,12],zero,zero,xmm8[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u],zero,zero,xmm8[3,10,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,5,12],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,u],zero,zero,xmm9[1,8,15,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,3,10],zero,zero,zero,xmm15[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm5[u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u],zero,zero,xmm7[4,11,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u,u,6,13],zero,zero,xmm8[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,zero,xmm8[4,11,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,6,13],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[1,8,15],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u,u,u],zero,zero,xmm9[2,9,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,4,11],zero,zero,xmm15[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u],zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u],zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm5[u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = ; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm10[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm11[0,7,14,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u],zero,zero,xmm9[3,10,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,5,12],zero,zero,xmm15[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[3,10],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm10[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm11[1,8,15,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u],zero,zero,xmm9[4,11,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,6,13],zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,6,13],zero,zero,xmm15[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u],zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,4,11],zero,zero,xmm5[u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,5,12],zero,zero,xmm5[u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u],zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[4,11,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,6,13],zero,zero,xmm5[u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[0,7,14,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[3,10,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,3,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u],zero,zero,xmm8[1,8,15,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[4,11,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,6,13],zero,zero,xmm6[u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm12, %xmm4 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm3, %xmm14, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[6,13] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm12, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm13, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,4,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u],zero,zero,zero,xmm4[5,12,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm3, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm2, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[6,13] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm14, %xmm2 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm3, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm4, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,4,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u],zero,zero,zero,xmm7[5,12,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm2, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm15, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] +; AVX1-ONLY-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[3,10] +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps (%rsp), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm14, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] -; AVX1-ONLY-NEXT: vpxor %xmm14, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm14[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[3,10] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[4,11] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm13, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm14[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[4,11] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps (%rsp), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm1, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm4[u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14,u,u] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm14[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[5,12] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm10, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14,u,u] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[5,12] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm7 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm13, %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm1, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm13, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm7 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm4[u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15,u,u] -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,6],xmm14[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[6,13] -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm2, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm1, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u],zero,zero,zero,xmm0[6,13,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,xmm8[2,9,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm4[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[0,7,14] -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm2, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm1, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm12 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,xmm8[3,10,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm4[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[1,8,15] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,0,7,14],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15,u,u] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[6,13] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm7, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u],zero,zero,zero,xmm0[6,13,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,1,8,15],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[2,9,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[0,7,14] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm1, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[u,u,u,u,2,9],zero,zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[3,10,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[1,8,15] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: addq $200, %rsp @@ -3185,643 +3222,637 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-SLOW-LABEL: load_i8_stride7_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $104, %rsp -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm14 +; AVX2-SLOW-NEXT: subq $72, %rsp +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm14, %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm13 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm7 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[6,13],zero,zero,xmm9[2,9,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm14, %ymm9 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7,8,9],ymm10[10],ymm9[11,12,13],ymm10[14],ymm9[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm13, %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7,8,9],ymm8[10],ymm5[11,12,13],ymm8[14],ymm5[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm11, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm6, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm11, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm8, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm8, %ymm7, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm8, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero ; AVX2-SLOW-NEXT: vpor %xmm8, %xmm12, %xmm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm5, %ymm8 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14] -; AVX2-SLOW-NEXT: vpor %xmm8, %xmm13, %xmm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm6, %ymm5, %ymm13 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm11, %ymm10, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm12, %xmm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14] +; AVX2-SLOW-NEXT: vpor %xmm12, %xmm14, %xmm12 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm14 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15] -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm15, %xmm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] +; AVX2-SLOW-NEXT: vpor %xmm11, %xmm15, %xmm11 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm3, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm15 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm14, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm14, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm14, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm14, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm14, %xmm6 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8,9,10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm15, %xmm6 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm15 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm1[1,2],ymm15[3],ymm1[4,5,6],ymm15[7,8],ymm1[9,10],ymm15[11],ymm1[12,13,14],ymm15[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm9[1,2,3],ymm6[4],ymm9[5,6],ymm6[7,8],ymm9[9,10,11],ymm6[12],ymm9[13,14],ymm6[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm7, %ymm6, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm15 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm13, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm13, %ymm3, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm13, %ymm3, %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7,8,9,10],ymm11[11],ymm0[12,13],ymm11[14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm13 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm3, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4,5,6],ymm13[7,8],ymm9[9,10],ymm13[11],ymm9[12,13,14],ymm13[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm9, %ymm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm9, %xmm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm8, %ymm9 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm6, %xmm1 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm6, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm7 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm6 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax) -; AVX2-SLOW-NEXT: addq $104, %rsp +; AVX2-SLOW-NEXT: addq $72, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i8_stride7_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: pushq %rax -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-FAST-NEXT: subq $40, %rsp +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm13 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7,8,9],ymm8[10],ymm1[11,12],ymm8[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm6, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm8, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX2-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,2,4,6,1,2,4,6] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,6,1,2,4,6] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm12 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm9[2],ymm1[3,4,5],ymm9[6],ymm1[7,8,9],ymm9[10],ymm1[11,12,13],ymm9[14],ymm1[15] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4,5],ymm7[6],ymm1[7,8,9],ymm7[10],ymm1[11,12,13],ymm7[14],ymm1[15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm6, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm1, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpor %xmm7, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,3,4,6,1,3,4,6] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,4,6,1,3,4,6] +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm9, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] ; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vmovdqa 208(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm15[5,12] +; AVX2-FAST-NEXT: vmovdqa 208(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12] ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX2-FAST-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX2-FAST-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm9, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm7, %ymm6, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX2-FAST-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm8, %ymm9, %ymm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm6, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm10, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14] -; AVX2-FAST-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm10, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm9, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX2-FAST-NEXT: vpor %xmm10, %xmm14, %xmm10 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm8, %ymm10, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm7, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm7, %xmm15, %xmm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14] +; AVX2-FAST-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm7, %ymm12, %ymm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm9, %ymm8, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15] -; AVX2-FAST-NEXT: vpor %xmm0, %xmm7, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15] +; AVX2-FAST-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm6, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm5, %ymm4, %ymm15 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm3, %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm3, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm11[1,2],ymm7[3],ymm11[4,5,6],ymm7[7,8],ymm11[9,10],ymm7[11],ymm11[12,13,14],ymm7[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm15[4,11],zero,zero,xmm15[0,7,14],zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm7, %xmm11, %xmm7 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3],ymm11[4],ymm9[5,6],ymm11[7,8],ymm9[9,10,11],ymm11[12],ymm9[13,14],ymm11[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm7 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm4, %xmm9, %xmm4 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1],ymm13[2,3],ymm9[4],ymm13[5,6,7,8],ymm9[9],ymm13[10,11],ymm9[12],ymm13[13,14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm6, %ymm5, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm6, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm0, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1,2],ymm4[3],ymm13[4,5,6],ymm4[7,8],ymm13[9,10],ymm4[11],ymm13[12,13,14],ymm4[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm0, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm10[1,2,3],ymm8[4],ymm10[5,6],ymm8[7,8],ymm10[9,10,11],ymm8[12],ymm10[13,14],ymm8[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6,7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7,8],ymm9[9],ymm2[10,11,12],ymm9[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,0,7,14],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX2-FAST-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7,8],ymm6[9],ymm3[10,11,12],ymm6[13],ymm3[14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm3, %xmm6, %xmm3 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,3,5,6,1,3,5,6] ; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm5[0],mem[1,2,3,4,5,6,7],ymm5[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm14[1,2,3,4,5,6,7],ymm7[8],ymm14[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm12[1,2,3,4,5,6,7],ymm4[8],ymm12[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm10[1,2,3,4,5,6,7],ymm1[8],ymm10[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm15[1,2,3,4,5,6,7],ymm5[8],ymm15[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0],ymm14[1,2,3,4,5,6,7],ymm1[8],ymm14[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, (%rdx) ; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm3, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-NEXT: popq %rax +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-FAST-NEXT: addq $40, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i8_stride7_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $104, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: subq $72, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm14, %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[6,13],zero,zero,xmm9[2,9,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm14, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7,8,9],ymm10[10],ymm9[11,12,13],ymm10[14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm13, %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7,8,9],ymm8[10],ymm5[11,12,13],ymm8[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm11, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm6, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm11, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm8, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm8, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm8, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm12, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm5, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm13, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm6, %ymm5, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm11, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm12, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm12, %xmm14, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm15, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm11, %xmm15, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm14, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm14, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm14, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm14, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm14, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8,9,10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm6, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm15, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm1[1,2],ymm15[3],ymm1[4,5,6],ymm15[7,8],ymm1[9,10],ymm15[11],ymm1[12,13,14],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm9[1,2,3],ymm6[4],ymm9[5,6],ymm6[7,8],ymm9[9,10,11],ymm6[12],ymm9[13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm7, %ymm6, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm13, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm13, %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm13, %ymm3, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7,8,9,10],ymm11[11],ymm0[12,13],ymm11[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4,5,6],ymm13[7,8],ymm9[9,10],ymm13[11],ymm9[12,13,14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm8, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm6, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm6, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $104, %rsp +; AVX2-FAST-PERLANE-NEXT: addq $72, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -4629,26 +4660,26 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i8_stride7_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1512, %rsp # imm = 0x5E8 -; SSE-NEXT: movdqa 208(%rdi), %xmm10 +; SSE-NEXT: subq $1528, %rsp # imm = 0x5F8 +; SSE-NEXT: movdqa 208(%rdi), %xmm12 ; SSE-NEXT: movdqa 192(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm14 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pxor %xmm6, %xmm6 ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -4661,35 +4692,35 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,0,65535,0,65535] -; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,0,65535,0,65535] +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 ; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] @@ -4698,36 +4729,35 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 256(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 272(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] @@ -4739,37 +4769,37 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa 240(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 240(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 224(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: movdqa 288(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: movdqa 304(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] @@ -4779,32 +4809,32 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE-NEXT: movdqa 320(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 368(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 368(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 384(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] @@ -4816,37 +4846,38 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa 352(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 352(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 336(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: movdqa 400(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa 416(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: movdqa 416(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] @@ -4856,32 +4887,31 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE-NEXT: movdqa 432(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 32(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm1 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] @@ -4893,283 +4923,283 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm11 -; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa 64(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: movdqa 80(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: movdqa 96(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm7, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm13 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrld $16, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm8 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm14 -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pslld $16, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: packuswb %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: packuswb %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pslld $16, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: packuswb %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: packuswb %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm14, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pslld $16, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: packuswb %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: packuswb %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE-NEXT: pxor %xmm5, %xmm5 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 @@ -5179,19 +5209,19 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm12 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] @@ -5199,522 +5229,531 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pandn %xmm14, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload ; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm13 -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm15, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: pandn %xmm8, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: pandn %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pand %xmm15, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pand %xmm15, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm11, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,2,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm10, %xmm7 -; SSE-NEXT: por %xmm11, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: pandn %xmm7, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pandn %xmm13, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: pslld $16, %xmm14 -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] -; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pslld $16, %xmm13 +; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm8 ; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm11 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,6,4,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm0[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,6,4,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,4,6,5] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm0[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,6,4,6,5] +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,4,6,5] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,4,6,5] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,6,4,6,5] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm11 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm9, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; SSE-NEXT: packuswb %xmm9, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm0, %xmm15 ; SSE-NEXT: pandn %xmm3, %xmm15 ; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: por %xmm3, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm15, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm15, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm15 +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: por %xmm3, %xmm15 ; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] -; SSE-NEXT: pand %xmm9, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] +; SSE-NEXT: pand %xmm14, %xmm15 ; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: packuswb %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: packuswb %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm15, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,3,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; SSE-NEXT: pand %xmm14, %xmm9 +; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm2, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm13, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,4,7] +; SSE-NEXT: por %xmm15, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: packuswb %xmm2, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm13, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,1,3] +; SSE-NEXT: packuswb %xmm2, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm10 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,3,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; SSE-NEXT: pand %xmm14, %xmm12 +; SSE-NEXT: por %xmm3, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] ; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm2 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,1,2,1] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,3,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 @@ -5722,282 +5761,284 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: pand %xmm14, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: pand %xmm14, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm13 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: pand %xmm14, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pandn %xmm13, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm3, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: por %xmm3, %xmm14 ; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,2,1,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: pxor %xmm6, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -6006,699 +6047,709 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: andps %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: andps %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm11, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: andps %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm15[8],xmm7[9],xmm15[9],xmm7[10],xmm15[10],xmm7[11],xmm15[11],xmm7[12],xmm15[12],xmm7[13],xmm15[13],xmm7[14],xmm15[14],xmm7[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: por %xmm7, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm13 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm13, %xmm2 -; SSE-NEXT: andps %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE-NEXT: pxor %xmm12, %xmm12 +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm12[8],xmm6[9],xmm12[9],xmm6[10],xmm12[10],xmm6[11],xmm12[11],xmm6[12],xmm12[12],xmm6[13],xmm12[13],xmm6[14],xmm12[14],xmm6[15],xmm12[15] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,5,4,7,6] +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm12[8],xmm4[9],xmm12[9],xmm4[10],xmm12[10],xmm4[11],xmm12[11],xmm4[12],xmm12[12],xmm4[13],xmm12[13],xmm4[14],xmm12[14],xmm4[15],xmm12[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm7 = xmm6[0],xmm7[1,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn %xmm9, %xmm3 +; SSE-NEXT: andps %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,5,4,7,6] +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm15[8],xmm7[9],xmm15[9],xmm7[10],xmm15[10],xmm7[11],xmm15[11],xmm7[12],xmm15[12],xmm7[13],xmm15[13],xmm7[14],xmm15[14],xmm7[15],xmm15[15] -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm2, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm8, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm7, %xmm12 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm12, %xmm2 -; SSE-NEXT: andps %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm15[8],xmm7[9],xmm15[9],xmm7[10],xmm15[10],xmm7[11],xmm15[11],xmm7[12],xmm15[12],xmm7[13],xmm15[13],xmm7[14],xmm15[14],xmm7[15],xmm15[15] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm15[8],xmm7[9],xmm15[9],xmm7[10],xmm15[10],xmm7[11],xmm15[11],xmm7[12],xmm15[12],xmm7[13],xmm15[13],xmm7[14],xmm15[14],xmm7[15],xmm15[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] -; SSE-NEXT: packuswb %xmm12, %xmm12 -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm12[0],xmm8[1,2,3] -; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movss {{.*#+}} xmm9 = xmm8[0],xmm9[1,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm7 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: por %xmm7, %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: pandn %xmm7, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm15[8],xmm12[9],xmm15[9],xmm12[10],xmm15[10],xmm12[11],xmm15[11],xmm12[12],xmm15[12],xmm12[13],xmm15[13],xmm12[14],xmm15[14],xmm12[15],xmm15[15] -; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: por %xmm14, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,0,3] +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: pand %xmm14, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: pandn %xmm7, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm13, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: pandn %xmm7, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,1,0,3] +; SSE-NEXT: movdqa %xmm6, %xmm13 +; SSE-NEXT: pandn %xmm7, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm7, %xmm14 -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm14, %xmm2 -; SSE-NEXT: andps %xmm13, %xmm8 -; SSE-NEXT: por %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: por %xmm7, %xmm13 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm13, %xmm7 +; SSE-NEXT: andps %xmm5, %xmm9 +; SSE-NEXT: por %xmm9, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: pand %xmm10, %xmm8 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm15[8],xmm12[9],xmm15[9],xmm12[10],xmm15[10],xmm12[11],xmm15[11],xmm12[12],xmm15[12],xmm12[13],xmm15[13],xmm12[14],xmm15[14],xmm12[15],xmm15[15] -; SSE-NEXT: movdqa %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: pandn %xmm9, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] ; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: por %xmm14, %xmm8 +; SSE-NEXT: por %xmm13, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm12, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm15[0],xmm8[1,2,3] -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pand %xmm6, %xmm15 -; SSE-NEXT: por %xmm12, %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm12, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] -; SSE-NEXT: pand %xmm3, %xmm15 -; SSE-NEXT: por %xmm6, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm11, %xmm6 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: pandn %xmm2, %xmm12 -; SSE-NEXT: andps %xmm7, %xmm8 -; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,5,4,7,6] +; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm9, %xmm15 +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm3, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: movss {{.*#+}} xmm15 = xmm9[0],xmm15[1,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: pandn %xmm11, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pand %xmm14, %xmm9 +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm8, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: por %xmm11, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pxor %xmm15, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: pandn %xmm8, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pand %xmm6, %xmm8 +; SSE-NEXT: por %xmm8, %xmm11 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pandn %xmm11, %xmm9 +; SSE-NEXT: andps %xmm5, %xmm15 +; SSE-NEXT: por %xmm15, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pand %xmm10, %xmm8 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm6, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm13 -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: andps %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pand %xmm10, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,2,2,3] +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm8[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pand %xmm14, %xmm8 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: por %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: pandn %xmm8, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: pandn %xmm14, %xmm11 +; SSE-NEXT: andps %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,3,3,3] +; SSE-NEXT: packuswb %xmm8, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm13 -; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] -; SSE-NEXT: pand %xmm8, %xmm10 -; SSE-NEXT: por %xmm13, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,1,1,1] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15] +; SSE-NEXT: pand %xmm10, %xmm14 +; SSE-NEXT: por %xmm15, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,1,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm10 -; SSE-NEXT: pand %xmm11, %xmm10 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: andps %xmm7, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: packuswb %xmm0, %xmm14 +; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: por %xmm8, %xmm14 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm14, %xmm3 +; SSE-NEXT: andps %xmm5, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: por %xmm9, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pandn %xmm4, %xmm12 +; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload +; SSE-NEXT: por %xmm12, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[3,3,3,3] +; SSE-NEXT: packuswb %xmm12, %xmm8 ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm1[0],xmm8[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm8, %xmm13 +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15] +; SSE-NEXT: pand %xmm10, %xmm14 +; SSE-NEXT: por %xmm15, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: andps %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pxor %xmm13, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: por %xmm13, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[3,3,3,3] +; SSE-NEXT: packuswb %xmm13, %xmm8 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm1[0],xmm8[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm13 ; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] -; SSE-NEXT: pand %xmm8, %xmm10 -; SSE-NEXT: por %xmm13, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,1,1,1] +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15] +; SSE-NEXT: pand %xmm10, %xmm14 +; SSE-NEXT: por %xmm15, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,1,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm10 -; SSE-NEXT: pand %xmm11, %xmm10 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: andps %xmm7, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: por %xmm14, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: por %xmm10, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[3,3,3,3] -; SSE-NEXT: packuswb %xmm10, %xmm3 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: andps %xmm4, %xmm8 +; SSE-NEXT: por %xmm8, %xmm13 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE-NEXT: movdqa %xmm8, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] -; SSE-NEXT: pand %xmm8, %xmm13 -; SSE-NEXT: por %xmm14, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,1,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: andps %xmm7, %xmm3 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pxor %xmm14, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: por %xmm9, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[3,3,3,3] -; SSE-NEXT: packuswb %xmm13, %xmm3 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] -; SSE-NEXT: pand %xmm8, %xmm13 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: por %xmm13, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,1,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,1,0,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] +; SSE-NEXT: pxor %xmm15, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm8[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[3,3,3,3] +; SSE-NEXT: packuswb %xmm8, %xmm14 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movss {{.*#+}} xmm14 = xmm1[0],xmm14[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15] +; SSE-NEXT: pand %xmm10, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: por %xmm11, %xmm2 -; SSE-NEXT: andps %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: por %xmm3, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%r9) +; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: andps %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm6, (%rax) -; SSE-NEXT: movdqa %xmm12, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rax) +; SSE-NEXT: movdqa %xmm11, (%rax) +; SSE-NEXT: movdqa %xmm9, 48(%rax) +; SSE-NEXT: movdqa %xmm7, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm7, (%rax) -; SSE-NEXT: movdqa %xmm10, 48(%rax) -; SSE-NEXT: movdqa %xmm1, 32(%rax) -; SSE-NEXT: movdqa %xmm0, 16(%rax) -; SSE-NEXT: addq $1512, %rsp # imm = 0x5E8 +; SSE-NEXT: movdqa %xmm1, (%rax) +; SSE-NEXT: movdqa %xmm13, 48(%rax) +; SSE-NEXT: movdqa %xmm12, 32(%rax) +; SSE-NEXT: movdqa %xmm3, 16(%rax) +; SSE-NEXT: addq $1528, %rsp # imm = 0x5F8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride7_vf64: @@ -6710,7 +6761,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,128,5,12,0,0,0,128,128,128,5,12,0,0,0] @@ -6752,738 +6803,777 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm13 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm2, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm2, %xmm9, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm0, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [9,128,128,128,0,0,0,2,9,128,128,128,0,0,0,2] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,0,7,14,0,0,0,128,128,0,7,14,0,0,0,128] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm5 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm3, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm5, %xmm13, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm0, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,5,12,0,0,0,128,128,128,5,12,0] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,7,14,128,128,0,0,0,0,7,14,128,128,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,0,0,7,14,128,128,0,0,0,0,7,14,128,128,0] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [10,128,128,128,0,0,0,3,10,128,128,128,0,0,0,3] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,1,8,15,0,0,0,128,128,1,8,15,0,0,0,128] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm3, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,1,8,15,0,0,0,128,128,1,8,15,0,0,0,128] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vpblendvb %xmm4, %xmm5, %xmm13, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [128,128,128,6,13,0,0,0,128,128,128,6,13,0,0,0] -; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,128,128,6,13,0,0,0,128,128,128,6,13,0,0,0] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,1,8,15,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [9,128,128,2,9,128,128,2,9,128,128,2,9,128,128,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm15, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm3, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm3, %xmm13, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm13, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm14, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm12 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,2,9,128,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm2 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,128,128,0,7,14,0,0,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm10 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm2, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm5 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm14, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm2, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm6, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm6 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [3,10,128,128,128,0,0,0,3,10,128,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,1,8,15,0,0,0,128,128,1,8,15,0,0,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm5 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm13 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,6,13,128,128,6,13,128,128,6,13,128,128,6,13,128] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm10, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm5, %xmm13, %xmm5 ; AVX1-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,6,13,0,0,0,128,128,128,6,13,0] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,1,8,15,128,128,0,0,0,1,8,15,128,128,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [128,2,9,128,128,2,9,128,128,2,9,128,128,2,9,128] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,1,8,15,128,128,0,0,0,1,8,15,128,128,0] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,2,9,128,128,2,9,128,128,2,9,128,128,2,9,128] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm1, %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,2,9],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,128,128,0,7,14,0,0,0,128,128,0,7,14,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm12 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm15 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,2,9,128,128,128,0,0,0,2,9,128,128,128,0] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm15 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,0,7,14,0,0,0,128,128,0,7,14,0] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm5 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm11 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm11 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm12, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm1, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm5, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm14 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,3,10,128,128,128,0,0,0,3,10,128,128,128,0] ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,0,128,128,1,8,15,0,0,0,128,128,1,8,15,0] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,1,8,15,0,0,0,128,128,1,8,15,0] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm11 +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm4, %xmm5, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm1, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] ; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX1-ONLY-NEXT: vpblendvb %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX1-ONLY-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm11 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm12, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm14, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] ; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm8, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm2, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm14 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,5,12,0,0,0,128,128,128,5,12,0,0,0,128,128] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [14,128,128,0,0,0,0,7,14,128,128,0,0,0,0,7] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0],xmm4[1,2],xmm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm11 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm9, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,5,12,0,0,0,128,128,128,5,12,0,0,0,128,128] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [14,128,128,0,0,0,0,7,14,128,128,0,0,0,0,7] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm7[1,2],xmm8[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm8, %xmm10, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm8 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,2,9,0,0,2,9,0,0,2,9,0,0,2,9] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm14, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,2,9,0,0,2,9,0,0,2,9,0,0,2,9] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,1,2,3,8,15,0,0,0,1,2,3,8,15] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,7,14,0,0,7,14,0,0,7,14,0,0,7,14,0] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm1, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,1,2,3,8,15,0,0,0,1,2,3,8,15] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm10 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,7,14,0,0,7,14,0,0,7,14,0,0,7,14,0] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm2, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm10, %ymm12, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm15 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm10 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] ; AVX1-ONLY-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm0[7] -; AVX1-ONLY-NEXT: vpxor %xmm11, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[3,10] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,128,128,128,128,128,3,10,0,128,128,128,128,128,3,10] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm14, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm15[3,10] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm11, %xmm7 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; AVX1-ONLY-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,1,8,15,0,1,8,15,0,1,8,15,0,1,8,15] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,128,128,128,128,128,4,11,0,128,128,128,128,128,4,11] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,1,8,15,0,1,8,15,0,1,8,15,0,1,8,15] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,128,128,128,128,128,4,11,0,128,128,128,128,128,4,11] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm1, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm15, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vxorps %xmm7, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,2,9,128,128,128,0,0,0,2,9,128,128,128,0,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,128,128,0,7,14,0,0,0,128,128,0,7,14,0,0] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,128,128,128,128,128,5,12,0,128,128,128,128,128,5,12] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u],zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,128,128,1,8,15,0,0,0,128,128,1,8,15,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,6],mem[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm10[6,13] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,128,128,0,7,14,0,0,0,128,128,0,7,14,0,0] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,128,128,128,128,128,5,12,0,128,128,128,128,128,5,12] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [12,0,0,0,128,128,128,5,12,0,0,0,128,128,128,5] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,0,7,14,128,128,0,0,0,0,7,14,128] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,128,128,1,8,15,0,0,0,128,128,1,8,15,0,0] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],mem[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,128,128,128,128,128,6,13,0,128,128,128,128,128,6,13] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u,u],zero,zero,zero,xmm10[5,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],mem[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm15[6,13] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u],zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,xmm6[2,9,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm4[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [13,0,0,0,128,128,128,6,13,0,0,0,128,128,128,6] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,1,8,15,128,128,0,0,0,1,8,15,128] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[2,9,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,128,128,128,128,0,7,14,0,128,128,128,128,0,7,14] ; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm9, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm15, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm10[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm1, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm13 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u],zero,zero,xmm13[2,9,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm14[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm11, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,0,0,0,2,9,128,128,128,0,0,0,2,9,128,128] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,2,9,128,128,128,0,0,0,2,9,128,128] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [14,0,0,0,128,128,0,7,14,0,0,0,128,128,0,7] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[u,u,u,u],zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm10 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,128,128,128,128,1,8,15,0,128,128,128,128,1,8,15] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm1, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm12[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm13, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,128,128,128,128,1,8,15,0,128,128,128,128,1,8,15] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm15, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm10[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7511,17 +7601,17 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) ; AVX1-ONLY-NEXT: addq $744, %rsp # imm = 0x2E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i8_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $744, %rsp # imm = 0x2E8 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-SLOW-NEXT: subq $760, %rsp # imm = 0x2F8 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm4 @@ -7529,9 +7619,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm12 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u> @@ -7553,242 +7643,240 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm12, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7,8,9],ymm2[10],ymm1[11,12,13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm8, %ymm7, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm6, %ymm10, %ymm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7,8,9],ymm4[10],ymm2[11,12,13],ymm4[14],ymm2[15] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm12, %ymm15, %ymm4 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm12, %ymm10, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm15, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7,8,9],ymm5[10],ymm4[11,12,13],ymm5[14],ymm4[15] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm15 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm10 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm15 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm12 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm11 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm2, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm13, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm9 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm8 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm9, %ymm15, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm11 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm12, %xmm14 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm9 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm11 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm7, %ymm10, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm11 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm1 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm6 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm10, %ymm7, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm8 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm11 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm13 -; AVX2-SLOW-NEXT: vpor %xmm8, %xmm11, %xmm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm12, %xmm10 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm11, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm15, %ymm8 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm12, %xmm6 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm8 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm7 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm14, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm14 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm11, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm9 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] -; AVX2-SLOW-NEXT: vmovdqa %xmm13, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; AVX2-SLOW-NEXT: vpor %xmm9, %xmm13, %xmm9 +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm9 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm11 +; AVX2-SLOW-NEXT: vpor %xmm9, %xmm11, %xmm9 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm9, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm6 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm14, %xmm7 +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm7 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm9 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm11 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm14 -; AVX2-SLOW-NEXT: vpor %xmm11, %xmm14, %xmm11 +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm12, %xmm9 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm14, %xmm11 +; AVX2-SLOW-NEXT: vpor %xmm9, %xmm11, %xmm9 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm15, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm15, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm6 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm7, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 @@ -7796,244 +7884,243 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm7 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm11 -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm11, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm14, %xmm9 +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm7, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm15, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm12 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm13, %xmm3 ; AVX2-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm1, %ymm15 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm13 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm10 -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm14 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm10 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm4 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7,8,9,10],ymm0[11],ymm9[12,13],ymm0[14],ymm9[15] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] -; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm6, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm6, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm6, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5,6],ymm3[7,8],ymm5[9,10],ymm3[11],ymm5[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm7, %ymm8, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm8, %ymm7, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm8, %ymm7, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm15 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm15, %xmm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm15 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1,2],ymm3[3],ymm6[4,5,6],ymm3[7,8],ymm6[9,10],ymm3[11],ymm6[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm15 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm3 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1,2,3],ymm3[4],ymm8[5,6],ymm3[7,8],ymm8[9,10,11],ymm3[12],ymm8[13,14],ymm3[15] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm3, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm12 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm4 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm10[1,2,3],ymm4[4],ymm10[5,6],ymm4[7,8],ymm10[9,10,11],ymm4[12],ymm10[13,14],ymm4[15] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5,6],ymm1[7,8],ymm6[9,10],ymm1[11],ymm6[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm14 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2,3],ymm3[4],ymm9[5,6],ymm3[7,8],ymm9[9,10,11],ymm3[12],ymm9[13,14],ymm3[15] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, %ymm6 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm10[1,2,3],ymm1[4],ymm10[5,6],ymm1[7,8],ymm10[9,10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm3 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpor %xmm3, %xmm5, %xmm3 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7,8],ymm5[9],ymm8[10,11,12],ymm5[13],ymm8[14,15] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] ; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm7, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm7 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm6 -; AVX2-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm7 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7,8],ymm8[9],ymm10[10,11,12],ymm8[13],ymm10[14,15] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7,8],ymm5[9],ymm11[10,11,12],ymm5[13],ymm11[14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm10, %xmm6 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vpor %xmm5, %xmm9, %xmm5 ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1,2,3,4,5,6,7],ymm10[8],ymm6[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm10 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm12, %xmm9 -; AVX2-SLOW-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm11 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm15[0],mem[1,2,3,4,5,6,7],ymm15[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2,3,4,5,6,7],ymm9[8],ymm5[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm9 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm8 +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm6[0],mem[1,2,3,4,5,6,7],ymm6[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm10 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm5[0],mem[1,2,3,4,5,6,7],ymm5[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm7[0],mem[1,2,3,4,5,6,7],ymm7[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm10, 32(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -8042,38 +8129,37 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovaps %ymm10, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm10, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-SLOW-NEXT: addq $744, %rsp # imm = 0x2E8 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-SLOW-NEXT: addq $760, %rsp # imm = 0x2F8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i8_stride7_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-FAST-NEXT: subq $776, %rsp # imm = 0x308 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm13 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u> @@ -8083,239 +8169,250 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm11 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm6, %ymm8, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm7 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm6 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm15, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm13, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm10, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm13, %ymm12, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm6, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4,5],ymm8[6],ymm3[7,8,9],ymm8[10],ymm3[11,12,13],ymm8[14],ymm3[15] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm5 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [1,2,0,2,1,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,0,2,1,2,4,6] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm3, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm13, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm9 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm15 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm4 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm4, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,0,2,1,3,4,6] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,0,2,1,3,4,6] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm11 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14] ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vpblendvb %ymm3, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm13, %ymm0 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm6 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm1 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm15 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm14, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] ; AVX2-FAST-NEXT: vmovdqa 432(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm1 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm12 -; AVX2-FAST-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm15 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm12 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm12, %xmm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm12, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm12, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm8 -; AVX2-FAST-NEXT: vpor %xmm6, %xmm8, %xmm10 -; AVX2-FAST-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm9 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm11 -; AVX2-FAST-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpor %xmm4, %xmm1, %xmm6 +; AVX2-FAST-NEXT: vmovdqa 208(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm10 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm11 +; AVX2-FAST-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm13, %ymm15, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm11 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm11 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpor %xmm11, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpor %xmm6, %xmm11, %xmm6 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm15 -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm13 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm14 -; AVX2-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm9, %ymm14, %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm9 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm10 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm15, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX2-FAST-NEXT: vpor %xmm13, %xmm15, %xmm13 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm13, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm10 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpor %xmm6, %xmm10, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX2-FAST-NEXT: vpor %xmm1, %xmm10, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm9, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm13, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm11 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm10 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm11, %xmm1 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm14 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm10, %xmm1 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm13 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm10 -; AVX2-FAST-NEXT: vpor %xmm14, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm11 +; AVX2-FAST-NEXT: vpor %xmm13, %xmm11, %xmm11 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm11, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm14, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm9 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm6 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm5 -; AVX2-FAST-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX2-FAST-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm10 +; AVX2-FAST-NEXT: vpor %xmm6, %xmm10, %xmm6 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm13, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 @@ -8323,37 +8420,40 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm11 -; AVX2-FAST-NEXT: vpor %xmm7, %xmm11, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm5 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm8 +; AVX2-FAST-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm14, %ymm4, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm4, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm13, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm3 ; AVX2-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 @@ -8361,192 +8461,198 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm13 ; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm13 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm5, %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm5, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8,9,10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm10, %ymm13, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm10, %ymm13, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm13, %ymm10, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7,8,9,10],ymm10[11],ymm6[12,13],ymm10[14],ymm6[15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpor %xmm3, %xmm9, %xmm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7,8,9,10],ymm9[11],ymm0[12,13],ymm9[14],ymm0[15] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm9 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX2-FAST-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1,2],ymm12[3],ymm1[4,5,6],ymm12[7,8],ymm1[9,10],ymm12[11],ymm1[12,13,14],ymm12[15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] -; AVX2-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm1, %ymm9 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5,6],ymm1[7,8],ymm2[9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm1 -; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1,2,3],ymm1[4],ymm5[5,6],ymm1[7,8],ymm5[9,10,11],ymm1[12],ymm5[13,14],ymm1[15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm2 -; AVX2-FAST-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1,2,3],ymm2[4],ymm7[5,6],ymm2[7,8],ymm7[9,10,11],ymm2[12],ymm7[13,14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpor %xmm2, %xmm7, %xmm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm5 -; AVX2-FAST-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5,6],ymm1[7,8],ymm12[9,10],ymm1[11],ymm12[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3 +; AVX2-FAST-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1,2,3],ymm3[4],ymm14[5,6],ymm3[7,8],ymm14[9,10,11],ymm3[12],ymm14[13,14],ymm3[15] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1,2,3],ymm1[4],ymm15[5,6],ymm1[7,8],ymm15[9,10,11],ymm1[12],ymm15[13,14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6,7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,1,2,1,3,5,6] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm7 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2,3,4],ymm8[5],ymm13[6,7,8],ymm8[9],ymm13[10,11,12],ymm8[13],ymm13[14,15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm4[1,2,3,4,5,6,7],ymm8[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2,3,4],ymm8[5],ymm11[6,7,8],ymm8[9],ymm11[10,11,12],ymm8[13],ymm11[14,15] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; AVX2-FAST-NEXT: vpor %xmm8, %xmm13, %xmm8 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [1,3,1,2,1,3,5,6] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm8[1,2,3,4,5,6,7],ymm6[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm10 -; AVX2-FAST-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm8[1,2,3,4,5,6,7],ymm3[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm6 +; AVX2-FAST-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm15[0],mem[1,2,3,4,5,6,7],ymm15[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, (%rsp), %ymm2, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm7[0],mem[1,2,3,4,5,6,7],ymm7[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm5[0],mem[1,2,3,4,5,6,7],ymm5[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -8555,28 +8661,28 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm10, (%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 32(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 32(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm7, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 32(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm7, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rax) -; AVX2-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-NEXT: addq $776, %rsp # imm = 0x308 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i8_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $744, %rsp # imm = 0x2E8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: subq $760, %rsp # imm = 0x2F8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm4 @@ -8584,9 +8690,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u> @@ -8608,242 +8714,240 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7,8,9],ymm2[10],ymm1[11,12,13],ymm2[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm8, %ymm7, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm6, %ymm10, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7,8,9],ymm4[10],ymm2[11,12,13],ymm4[14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm12, %ymm15, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm12, %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm1, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm15, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7,8,9],ymm5[10],ymm4[11,12,13],ymm5[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm13, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm2, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm9, %ymm15, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm12 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm7, %ymm10, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm10, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm11, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm11, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm15, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm14, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm14, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm11, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm9, %xmm13, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm13, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm9, %xmm11, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm9, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm4, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm13 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm11, %xmm14, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm12, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm9, %xmm11, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm15, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm15, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm14, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm13, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 @@ -8851,244 +8955,243 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm11, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm14, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm7, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm15, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm13, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm2, %ymm1, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm2, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7,8,9,10],ymm0[11],ymm9[12,13],ymm0[14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm6, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm6, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm6, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5,6],ymm3[7,8],ymm5[9,10],ymm3[11],ymm5[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm7, %ymm8, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm8, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm8, %ymm7, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm15, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1,2],ymm3[3],ymm6[4,5,6],ymm3[7,8],ymm6[9,10],ymm3[11],ymm6[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1,2,3],ymm3[4],ymm8[5,6],ymm3[7,8],ymm8[9,10,11],ymm3[12],ymm8[13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm10[1,2,3],ymm4[4],ymm10[5,6],ymm4[7,8],ymm10[9,10,11],ymm4[12],ymm10[13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5,6],ymm1[7,8],ymm6[9,10],ymm1[11],ymm6[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2,3],ymm3[4],ymm9[5,6],ymm3[7,8],ymm9[9,10,11],ymm3[12],ymm9[13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm10[1,2,3],ymm1[4],ymm10[5,6],ymm1[7,8],ymm10[9,10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm5, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7,8],ymm5[9],ymm8[10,11,12],ymm5[13],ymm8[14,15] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm7, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7,8],ymm8[9],ymm10[10,11,12],ymm8[13],ymm10[14,15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7,8],ymm5[9],ymm11[10,11,12],ymm5[13],ymm11[14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm9, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1,2,3,4,5,6,7],ymm10[8],ymm6[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm15[0],mem[1,2,3,4,5,6,7],ymm15[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2,3,4,5,6,7],ymm9[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm9, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm11, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm6[0],mem[1,2,3,4,5,6,7],ymm6[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm5[0],mem[1,2,3,4,5,6,7],ymm5[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm7[0],mem[1,2,3,4,5,6,7],ymm7[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 32(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -9097,2061 +9200,2077 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 32(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $744, %rsp # imm = 0x2E8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $760, %rsp # imm = 0x2F8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i8_stride7_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: pushq %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm24, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: subq $72, %rsp +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm11, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 96(%rdi), %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm24, %ymm19, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm16, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm5[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm22, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 288(%rdi), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4,5],ymm5[6],ymm2[7,8,9],ymm5[10],ymm2[11,12,13],ymm5[14],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 416(%rdi), %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm5[4,11],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm29, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm5, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm26, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 288(%rdi), %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm27, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm4, %xmm9, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4,5],ymm13[6],ymm10[7,8,9],ymm13[10],ymm10[11,12,13],ymm13[14],ymm10[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm28 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm28, %ymm9, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 416(%rdi), %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm13, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm9, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm3, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm30, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm24, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm13[2],ymm3[3,4,5],ymm13[6],ymm3[7,8,9],ymm13[10],ymm3[11,12,13],ymm13[14],ymm3[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm13, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm16, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm13[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[1,8,15],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm22, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm3, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm9, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm30, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm24, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8,9,10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm5, %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm31, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm6, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm30, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm24, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm0[2],ymm7[3,4,5],ymm0[6],ymm7[7,8,9],ymm0[10],ymm7[11,12,13],ymm0[14],ymm7[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm29, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm13[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm14[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm10, %xmm13, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm26, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm27, %ymm20, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7,8,9,10],ymm13[11],ymm10[12,13],ymm13[14],ymm10[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm28, %ymm7, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm3, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm30, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6],ymm6[7,8],ymm5[9,10],ymm6[11],ymm5[12,13,14],ymm6[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm3, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm9, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm30, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm24, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm3[1,2],ymm0[3],ymm3[4,5,6],ymm0[7,8],ymm3[9,10],ymm0[11],ymm3[12,13,14],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm27, %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm31, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[5,12] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm8, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm24, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7,8,9,10],ymm0[11],ymm7[12,13],ymm0[14],ymm7[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm5, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm27, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm13[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm6, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm14, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm14[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm10, %xmm13, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm27, %ymm20, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1,2],ymm13[3],ymm10[4,5,6],ymm13[7,8],ymm10[9,10],ymm13[11],ymm10[12,13,14],ymm13[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm28, %ymm7, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm3, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm30, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3],ymm6[4],ymm5[5,6],ymm6[7,8],ymm5[9,10,11],ymm6[12],ymm5[13,14],ymm6[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm3, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm9, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm30, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm24, %ymm29, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6,7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm3, %ymm9, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm3, %ymm9, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm31, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm24, %ymm29, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm24, %ymm29, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm7, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %ymm19, %ymm12, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm11, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm11, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2,3],ymm11[4],ymm2[5,6],ymm11[7,8],ymm2[9,10,11],ymm11[12],ymm2[13,14],ymm11[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm18, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm24, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm7[1,2],ymm0[3],ymm7[4,5,6],ymm0[7,8],ymm7[9,10],ymm0[11],ymm7[12,13,14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm5, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[2,9,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm10, %xmm13, %xmm10 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm13, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm10, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm27, %ymm20, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1,2,3],ymm13[4],ymm10[5,6],ymm13[7,8],ymm10[9,10,11],ymm13[12],ymm10[13,14],ymm13[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm28, %ymm7, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm13, %xmm7 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm27, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm3, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm30, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm27, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5,6,7,8],ymm10[9],ymm7[10,11],ymm10[12],ymm7[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm28, %ymm5, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,5,12],zero,zero,xmm5[1,8,15],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm0, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm7, %ymm3, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm27, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7,8],ymm10[9],ymm7[10,11,12],ymm10[13],ymm7[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm0, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm7, %ymm3, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm12, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[6,13] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm6, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm24, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %ymm20, %ymm1, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm24, %ymm19, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm24, %ymm19, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0],ymm10[1,2,3],ymm8[4],ymm10[5,6],ymm8[7,8],ymm10[9,10,11],ymm8[12],ymm10[13,14],ymm8[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm0, %ymm2, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm17, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6,7,8],ymm11[9],ymm8[10,11],ymm11[12],ymm8[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm18, %ymm5, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[6,13],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[5,12],zero,zero,xmm13[1,8,15],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5,6,7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm0, %ymm5, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7,8],ymm11[9],ymm12[10,11,12],ymm11[13],ymm12[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm18, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm31, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %ymm17, %ymm1, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm16, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm11, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm27, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm9, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm27, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm7, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm0, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7,8],ymm8[9],ymm1[10,11,12],ymm8[13],ymm1[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm11 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm2, %ymm5, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm14, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %ymm18, %ymm9, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm29, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm17, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[1,8,15] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm17, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm15, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7,8,9],ymm5[10],ymm15[11,12],ymm5[13],ymm15[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm23, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7],ymm5[8,9,10],ymm3[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm27, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm16[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm23, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rdx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-ONLY-SLOW-NEXT: popq %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $72, %rsp ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i8_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: pushq %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 32(%rdi), %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm20, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm25, %ymm11, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm10, %ymm12, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm9 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm30 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 160(%rdi), %ymm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm26, %ymm31, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,6,1,2,4,6] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm6, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm6, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm10, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm8, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 288(%rdi), %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm17, %ymm10, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,4,6,1,2,4,6] +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 288(%rdi), %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm28, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm13, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm22, %ymm13, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm14[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1],ymm2[2],ymm14[3,4,5],ymm2[6],ymm14[7,8,9],ymm2[10],ymm14[11,12,13],ymm2[14],ymm14[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm24, %ymm4, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 416(%rdi), %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm19, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm4[4,11],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm2, %ymm27, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 352(%rdi), %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm4, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm15[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4,5],ymm7[6],ymm15[7,8,9],ymm7[10],ymm15[11,12,13],ymm7[14],ymm15[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm27, %ymm8, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 416(%rdi), %ymm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm7, %ymm22, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm29, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm20, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm11, %ymm25, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm3[2],ymm4[3,4,5],ymm3[6],ymm4[7,8,9],ymm3[10],ymm4[11,12,13],ymm3[14],ymm4[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm26, %ymm31, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,4,6,1,3,4,6] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm10, %ymm17, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero,xmm4[u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm22, %ymm13, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm24, %ymm4, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm19, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm27, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm12, %ymm10, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4,5],ymm9[6],ymm5[7,8,9],ymm9[10],ymm5[11,12,13],ymm9[14],ymm5[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,4,6,1,3,4,6] +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm17, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm28, %ymm18, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm4, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm27, %ymm5, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm7, %ymm22, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm29, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm12, %ymm10, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3],ymm5[4,5],ymm9[6],ymm5[7,8,9,10],ymm9[11],ymm5[12,13],ymm9[14],ymm5[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,5,6,1,3,5,6] +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm7, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm5, %zmm24, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm28, %ymm18, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm4, %ymm16, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2],ymm7[3],ymm5[4,5,6],ymm7[7,8],ymm5[9,10],ymm7[11],ymm5[12,13,14],ymm7[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm22, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm21 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm29, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm20, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm11, %ymm25, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8,9,10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm5, %ymm2, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm12, %ymm10, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm3[1,2],ymm9[3],ymm3[4,5,6],ymm9[7,8],ymm3[9,10],ymm9[11],ymm3[12,13,14],ymm9[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm2, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm26, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,6,1,3,5,6] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm9, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm7, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm4, %zmm28, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm10, %ymm17, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm22, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5,6],ymm5[7,8],ymm4[9,10],ymm5[11],ymm4[12,13,14],ymm5[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm24, %ymm2, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm19, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm4, %ymm27, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm29, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm20, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm11, %ymm25, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5,6],ymm3[7,8],ymm2[9,10],ymm3[11],ymm2[12,13,14],ymm3[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm6, %ymm1, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm26, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm1, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm9[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm28, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm10, %ymm17, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm6, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm22, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3],ymm7[4],ymm6[5,6],ymm7[7,8],ymm6[9,10,11],ymm7[12],ymm6[13,14],ymm7[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm24, %ymm2, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm18, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm6, %ymm27, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm29, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm20, %ymm16, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %xmm4, %xmm2, %xmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm17, %ymm10, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm22, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm24, %ymm2, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm18, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %xmm6, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm4, %ymm27, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm17, %ymm10, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm22, %ymm13, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm18, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm6, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm4, %ymm27, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm26, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm2, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm19, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm20, %ymm16, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm20, %ymm16, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm2, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 208(%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[5,12] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX512F-ONLY-FAST-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm11, %ymm25, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %ymm17, %ymm12, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm25, %ymm11, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm25, %ymm11, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm11, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2,3],ymm11[4],ymm2[5,6],ymm11[7,8],ymm2[9,10,11],ymm11[12],ymm2[13,14],ymm11[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm3, %ymm29, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm6, %ymm16, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5,6,7,8],ymm11[9],ymm9[10,11],ymm11[12],ymm9[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm3, %ymm7, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7,8],ymm11[9],ymm12[10,11,12],ymm11[13],ymm12[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm25, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm8, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm28, %ymm18, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm4, %ymm16, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm9, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm8, %ymm22, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm29, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm28, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm4, %ymm16, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5,6,7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm22, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm28, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm4, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm22, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm12, %ymm10, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %ymm18, %ymm11, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm10, %ymm12, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm10, %ymm12, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3],ymm10[4],ymm8[5,6],ymm10[7,8],ymm8[9,10,11],ymm10[12],ymm8[13,14],ymm10[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm1, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm3, %ymm25, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm10[1],ymm14[2,3],ymm10[4],ymm14[5,6,7,8],ymm10[9],ymm14[10,11],ymm10[12],ymm14[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm9, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7,8],ymm13[9],ymm11[10,11,12],ymm13[13],ymm11[14,15] ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm3, %ymm0, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm26, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %ymm22, %ymm14, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm26, %ymm31, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm2, %ymm16, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm11, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm3, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,3,10],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm11, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %ymm16, %ymm15, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm25, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero,xmm15[u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm7, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm2, %ymm16, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm7, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm25, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm10, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm3 ; AVX512F-ONLY-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm13[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm13[0,1],ymm5[2],ymm13[3,4],ymm5[5],ymm13[6,7,8,9],ymm5[10],ymm13[11,12],ymm5[13],ymm13[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7],ymm5[8,9,10],ymm3[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm28, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm4[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm26, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: popq %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: load_i8_stride7_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %ymm26 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm8, %ymm26, %ymm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm11, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512DQ-SLOW-NEXT: subq $104, %rsp +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm28 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-SLOW-NEXT: vmovdqa64 96(%rdi), %ymm26 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm27, %ymm26, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] ; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm16 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm1 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm16, %ymm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm30 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm23 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm12 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm8 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 288(%rdi), %ymm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm5 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm17, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,3,10],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[u,u] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm9, %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm13 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm13 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm13[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0,1],ymm10[2],ymm13[3,4,5],ymm10[6],ymm13[7,8,9],ymm10[10],ymm13[11,12,13],ymm10[14],ymm13[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm5, %ymm9, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vmovdqa64 416(%rdi), %ymm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm22 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm13 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm13, %xmm4 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm25, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm1[2],ymm4[3,4,5],ymm1[6],ymm4[7,8,9],ymm1[10],ymm4[11,12,13],ymm1[14],ymm4[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm10[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm12 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm1, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm22, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 288(%rdi), %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm1 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm17 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm2 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7,8,9],ymm4[10],ymm2[11,12,13],ymm4[14],ymm2[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm27 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm27, %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqa64 416(%rdi), %ymm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm21 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm4 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm4[4,11],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm10, %xmm13, %xmm10 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm20, %ymm4 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero,xmm4[u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm10, %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm10 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7,8,9,10],ymm13[11],ymm10[12,13],ymm13[14],ymm10[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm5, %ymm4, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10] +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm13, %xmm4 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm24, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm30, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm8, %ymm26, %ymm2 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm1[2],ymm5[3,4,5],ymm1[6],ymm5[7,8,9],ymm1[10],ymm5[11,12,13],ymm1[14],ymm5[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm16, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[1,8,15],zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm22, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm2 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm4 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm27, %ymm2, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10] -; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm4, %ymm24, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm22 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm30, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm8, %ymm26, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm25, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm28, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm2, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm31, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8,9,10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm11 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm1, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm10, %xmm0 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm4 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm6, %xmm3 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm8, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm12[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm10, %xmm13, %xmm10 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm20, %ymm4 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm13 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[6,13,u,u] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm5 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1,2],ymm2[3],ymm5[4,5,6],ymm2[7,8],ymm5[9,10],ymm2[11],ymm5[12,13,14],ymm2[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm27, %ymm4, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm13, %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm13 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm13 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm13[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm13[1,2],ymm6[3],ymm13[4,5,6],ymm6[7,8],ymm13[9,10],ymm6[11],ymm13[12,13,14],ymm6[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm5, %ymm4, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11] -; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm13, %xmm4 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm24, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm25 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm30, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm8, %ymm26, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm6, %ymm25, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5,6],ymm1[7,8],ymm2[9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm23, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm31, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[5,12] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, %xmm1 -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm23, %ymm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1,2],ymm11[3],ymm4[4,5,6],ymm11[7,8],ymm4[9,10],ymm11[11],ymm4[12,13,14],ymm11[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm23, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm13, %xmm6 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm12[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm13, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm20, %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,6,13],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[u,u] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm6 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm6[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1,2,3],ymm10[4],ymm6[5,6],ymm10[7,8],ymm6[9,10,11],ymm10[12],ymm6[13,14],ymm10[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm5, %ymm4, %ymm6 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm4 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6],ymm5[7,8],ymm4[9,10,11],ymm5[12],ymm4[13,14],ymm5[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm27, %ymm2, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm2 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm4, %ymm24, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm28 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm30, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm4 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm10, %xmm4 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm6, %ymm25, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm17, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm2 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm27, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm4 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm5, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm23 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm4, %ymm25, %ymm23 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm0 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm27 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm24, %ymm27 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm26, %ymm8, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm17, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm2 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm4 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm30 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm24, %ymm30 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm31, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm29 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm4, %ymm25, %ymm29 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vporq %xmm2, %xmm0, %xmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm29 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm26, %ymm8, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm26, %ymm8, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm6, %xmm7 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[6,13] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %ymm19, %ymm9, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm8 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm11, %ymm8 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm11, %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6],ymm1[7,8],ymm2[9,10,11],ymm1[12],ymm2[13,14],ymm1[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm18, %ymm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm11 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm11, %ymm23, %ymm5 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6,7,8],ymm1[9],ymm8[10,11],ymm1[12],ymm8[13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm18, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm1[1],ymm9[2,3,4],ymm1[5],ymm9[6,7,8],ymm1[9],ymm9[10,11,12],ymm1[13],ymm9[14,15] -; AVX512DQ-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm11 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm18, %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm31, %ymm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %ymm17, %ymm10, %ymm15 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm16, %ymm10 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm13, %xmm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vporq %xmm4, %xmm0, %xmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm28 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm13 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[6,13] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %ymm20, %ymm15, %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm27, %ymm26, %ymm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm27, %ymm26, %ymm15 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3],ymm11[4],ymm10[5,6],ymm11[7,8],ymm10[9,10,11],ymm11[12],ymm10[13,14],ymm11[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm1, %ymm2, %ymm14 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm0, %ymm10 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm23, %ymm4 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm7, %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm23, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm18, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm19, %ymm4 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[5,12],zero,zero,xmm13[1,8,15],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm10, %xmm12, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm11[1],ymm6[2,3],ymm11[4],ymm6[5,6,7,8],ymm11[9],ymm6[10,11],ymm11[12],ymm6[13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm1, %ymm12, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7,8],ymm11[9],ymm15[10,11,12],ymm11[13],ymm15[14,15] +; AVX512DQ-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm2, %ymm6, %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm15[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm14, %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %ymm18, %ymm9, %ymm16 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm9 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14] +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm19, %ymm4 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm6, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] +; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm19, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm15, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm1, %zmm4 ; AVX512DQ-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm27, %zmm0, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm30, %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm15[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7,8,9],ymm4[10],ymm15[11,12],ymm4[13],ymm15[14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm5 -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm29, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm23, %zmm0, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm29, %zmm0, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u] +; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm17, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm3 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm16[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm3 +; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm28, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-SLOW-NEXT: addq $104, %rsp ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i8_stride7_vf64: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: pushq %rax -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm1 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm8, %ymm16, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm1 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm2 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm24, %ymm11, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, %ymm7 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm12, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 80(%rdi), %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm10[2],ymm2[3,4],ymm10[5],ymm2[6,7,8,9],ymm10[10],ymm2[11,12],ymm10[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %ymm30 ; AVX512DQ-FAST-NEXT: vmovdqa64 160(%rdi), %ymm31 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm25, %ymm31, %ymm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm1 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,2,4,6,1,2,4,6] -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,4,6,1,2,4,6] +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm23 -; AVX512DQ-FAST-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm3, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm21, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 288(%rdi), %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm2 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm10, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm13, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 352(%rdi), %ymm29 -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm14 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm29, %ymm13, %ymm14 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm14[2,3,0,1] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0,1],ymm3[2],ymm14[3,4,5],ymm3[6],ymm14[7,8,9],ymm3[10],ymm14[11,12,13],ymm3[14],ymm14[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm21 +; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm24, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 288(%rdi), %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %ymm29 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm4 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm29, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm4, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 352(%rdi), %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm16, %ymm4, %ymm15 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm15[2,3,0,1] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4,5],ymm7[6],ymm15[7,8,9],ymm7[10],ymm15[11,12,13],ymm7[14],ymm15[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqa64 416(%rdi), %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %ymm19 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm19, %ymm2 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero -; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm3, %ymm22, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm20, %zmm30, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm8, %ymm16, %ymm2 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm8, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqa64 416(%rdi), %ymm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm8 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm8 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero +; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm7, %ymm23, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm28, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm11, %ymm24, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4,5],ymm7[6],ymm3[7,8,9],ymm7[10],ymm3[11,12,13],ymm7[14],ymm3[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm2 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm25, %ymm31, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,4,6,1,3,4,6] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, %xmm9 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm21, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm10, %ymm17, %ymm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm29, %ymm13, %ymm5 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm19, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm6 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm12, %ymm17, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm10[2],ymm6[3,4,5],ymm10[6],ymm6[7,8,9],ymm10[10],ymm6[11,12,13],ymm10[14],ymm6[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,4,6,1,3,4,6] +; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm24, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm6 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm29, %ymm18, %ymm6 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm7 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm16, %ymm4, %ymm7 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm6, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm6 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[5,12],zero,zero +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10] +; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm7, %ymm23, %ymm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm25 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm28, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm6 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm12, %ymm17, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7,8,9,10],ymm10[11],ymm6[12,13],ymm10[14],ymm6[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm26, %ymm3, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm5, %ymm22, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm20 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm30, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm8, %ymm16, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm11, %ymm24, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7,8,9,10],ymm7[11],ymm3[12,13],ymm7[14],ymm3[15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm21, %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm2 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm25, %ymm2 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,6,1,3,5,6] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm10, %ymm17, %ymm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,5,6,1,3,5,6] +; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm7, %xmm3 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm24, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm29, %ymm18, %ymm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm5 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm29, %ymm5 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6],ymm6[7,8],ymm5[9,10],ymm6[11],ymm5[12,13,14],ymm6[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm19, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero +; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm6 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm4, %ymm16, %ymm6 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4,5,6],ymm7[7,8],ymm6[9,10],ymm7[11],ymm6[12,13,14],ymm7[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm5, %ymm22, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm23 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm30, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm8, %ymm16, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm11, %ymm24, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2],ymm7[3],ymm3[4,5,6],ymm7[7,8],ymm3[9,10],ymm7[11],ymm3[12,13,14],ymm7[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm21, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm1 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm25, %ymm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12] -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm7, %xmm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm26, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm9[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm28, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm10, %ymm17, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm29, %ymm6 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3],ymm7[4],ymm6[5,6],ymm7[7,8],ymm6[9,10,11],ymm7[12],ymm6[13,14],ymm7[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm5, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm5 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm18, %ymm5 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero -; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm22, %ymm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm28 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm30, %zmm28 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm23, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm28, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm12, %ymm17, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0],ymm3[1,2],ymm10[3],ymm3[4,5,6],ymm10[7,8],ymm3[9,10],ymm10[11],ymm3[12,13,14],ymm10[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm26, %ymm2, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm2 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm10, %ymm2 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm29, %ymm5 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6,7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm2, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm18, %ymm2 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm2 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm27 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm5, %ymm22, %ymm27 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm16, %ymm8, %ymm2 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm10, %ymm2 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm5 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm29, %ymm13, %ymm5 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm2 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm18, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm30 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm5, %ymm22, %ymm30 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm25, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %xmm5, %xmm2, %xmm22 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm2 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm19, %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm5 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm16, %ymm8, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm6 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm16, %ymm8, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[6,13] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm11, %ymm24, %ymm2 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %ymm17, %ymm12, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm8 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm24, %ymm11, %ymm8 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm24, %ymm11, %ymm12 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm11, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm2, %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa 208(%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[5,12] +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQ-FAST-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm0[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm8, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm29, %ymm18, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm8 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm4, %ymm16, %ymm8 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero +; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm9, %xmm3 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm8, %ymm23, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm24 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm28, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm29, %ymm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm4, %ymm16, %ymm6 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6,7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero +; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm27 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm23, %ymm27 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm29, %ymm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm6 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm16, %ymm4, %ymm6 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7,8],ymm8[9],ymm6[10,11,12],ymm8[13],ymm6[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14] +; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm28 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm23, %ymm28 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vporq %xmm6, %xmm3, %xmm23 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm8 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQ-FAST-NEXT: vpor %xmm9, %xmm13, %xmm9 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm12, %ymm17, %ymm8 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %ymm18, %ymm11, %ymm29 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm13 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm12, %ymm13 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm12, %ymm11 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm10, %xmm3 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6],ymm0[7,8],ymm2[9,10,11],ymm0[12],ymm2[13,14],ymm0[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm21, %ymm9, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm11, %ymm26, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm8[1,2,3],ymm0[4],ymm8[5,6],ymm0[7,8],ymm8[9,10,11],ymm0[12],ymm8[13,14],ymm0[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm26, %ymm1, %ymm12 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm8, %ymm1, %ymm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5,6,7,8],ymm0[9],ymm13[10,11],ymm0[12],ymm13[13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm21, %ymm5, %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[6,13],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7,8],ymm0[9],ymm12[10,11,12],ymm0[13],ymm12[14,15] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm21, %ymm4, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm26, %ymm3, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm0[1],ymm11[2,3,4],ymm0[5],ymm11[6,7,8],ymm0[9],ymm11[10,11,12],ymm0[13],ymm11[14,15] +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm26, %ymm3, %ymm9 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm25, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm12, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm6, %xmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %ymm29, %ymm14, %ymm13 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm25, %ymm31, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14] +; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %ymm16, %ymm15, %ymm4 +; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14] +; AVX512DQ-FAST-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm1, %ymm11 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero,xmm15[u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15] +; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm7, %xmm2 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm2, %ymm26, %ymm6 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm12, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15] -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm2, %ymm26, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm1, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm9, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm27, %zmm0, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm30, %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm13[2,3,0,1] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm13[0,1],ymm5[2],ymm13[3,4],ymm5[5],ymm13[6,7,8,9],ymm5[10],ymm13[11,12],ymm5[13],ymm13[14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm27, %zmm0, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm28, %zmm0, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u] +; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm29, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm4[2,3,0,1] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm22, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero +; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7],ymm5[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, (%r9) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-FAST-NEXT: popq %rax ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: load_i8_stride7_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15> +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31> +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31> +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovq %k1, %k2 ; AVX512BW-ONLY-SLOW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm4, %xmm1, %xmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm4, %xmm3, %xmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm3, %ymm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm0, %ymm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX512BW-ONLY-SLOW-NEXT: movw $8772, %ax # imm = 0x2244 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm8, %ymm5, %ymm3 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm11, %ymm6, %ymm0 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm10, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm10, %zmm25 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k5} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 ; AVX512BW-ONLY-SLOW-NEXT: movw $9288, %ax # imm = 0x2448 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm10 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm14, %xmm10, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm16, %ymm15, %ymm10 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm10[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm14[2],ymm10[3,4,5],ymm14[6],ymm10[7,8,9],ymm14[10],ymm10[11,12,13],ymm14[14],ymm10[15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm5, %xmm0, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm5 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm10, %ymm19 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm5, %ymm19 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX512BW-ONLY-SLOW-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm14, %ymm10, %ymm20 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm20 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm20, %xmm22 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero ; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm22, %xmm20, %xmm20 ; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 ; AVX512BW-ONLY-SLOW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm22 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm22 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm23 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512BW-ONLY-SLOW-NEXT: kmovd %edi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm22, %ymm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm8, %ymm5, %ymm22 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm22, %ymm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm11, %ymm6, %ymm22 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm1, %xmm22, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm11 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm0 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm14, %xmm22, %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm2, %xmm14, %xmm2 ; AVX512BW-ONLY-SLOW-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %edi, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm1, %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm17, %xmm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm9, %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm21[0],xmm17[0],xmm21[1],xmm17[1],xmm21[2],xmm17[2],xmm21[3],xmm17[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm3, %xmm17, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm0, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm0 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm1 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm3, %xmm18, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm14 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm14, %xmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm22, %xmm14, %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm8, %xmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm14, %xmm21, %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm3, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm3, %xmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm18, %xmm3, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm18, %xmm21, %xmm18 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 ; AVX512BW-ONLY-SLOW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %edi, %k2 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm3, %xmm18, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm0, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm18, %ymm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm18, %xmm21, %xmm18 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm18, %zmm3, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm18 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm0 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm16, %ymm15, %ymm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm14, %ymm10, %ymm1 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm0 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm16, %ymm1 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5,6],ymm3[7,8],ymm1[9,10],ymm3[11],ymm1[12,13,14],ymm3[15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm14, %ymm10, %ymm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm0 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm16, %ymm1 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6],ymm3[7,8],ymm1[9,10,11],ymm3[12],ymm1[13,14],ymm3[15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm14, %ymm1 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm18 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm1, %xmm0, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm16, %ymm0 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm0, %ymm19 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm16, %ymm15, %ymm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm3, %xmm2, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm2 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm2, %ymm19 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovq %k1, %k7 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm20 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm0, %ymm20 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm14, %ymm0 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm0, %ymm19 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm14, %ymm0 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm0, %ymm20 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm0 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm2, %ymm21 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm16, %ymm15 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm15[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7,8,9],ymm1[10],ymm15[11,12],ymm1[13],ymm15[14,15] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm2, %ymm19 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm2 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm1, %ymm21 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm17, %ymm0 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm13, %ymm12 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm12 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm0, %xmm12, %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm14, %ymm10 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm2, %ymm1 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm3 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm6, %ymm2 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm8, %ymm5 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm15, %ymm5 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm1, %ymm12 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm10, %ymm1 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm2, %xmm10, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512BW-ONLY-SLOW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm6, %ymm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm10, %ymm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm10, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm10, %ymm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm10, %xmm6 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm6, %ymm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm6, %ymm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm6, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm7, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm8, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm7, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm8, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm10, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm12 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm11 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm21, %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm21, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm9, %xmm13, %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm3, %zmm9 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm6[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm26, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm3, %zmm12 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm26, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm3, %zmm1 {%k5} ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, (%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, (%rcx) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rdi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, (%rdi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: load_i8_stride7_vf64: ; AVX512BW-ONLY-FAST: # %bb.0: ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm12 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512BW-ONLY-FAST-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm1 {%k1} ; AVX512BW-ONLY-FAST-NEXT: kmovq %k1, %k2 ; AVX512BW-ONLY-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm4, %ymm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 ; AVX512BW-ONLY-FAST-NEXT: movw $8772, %ax # imm = 0x2244 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm8, %ymm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm13, %ymm11, %ymm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: kmovq %k1, %k3 ; AVX512BW-ONLY-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 @@ -11165,286 +11284,284 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm5, %ymm5 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm6 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm7 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm8 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm10, %xmm5 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512BW-ONLY-FAST-NEXT: kmovq %rax, %k5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm4, %zmm2 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX512BW-ONLY-FAST-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k7 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm12 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm12[u,u,u,u,u,3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u],zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm12, %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k6 +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm10 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm10[u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm10, %xmm21 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: movw $3968, %ax # imm = 0xF80 -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm10, %ymm21 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm10 +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm6, %ymm21 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 ; AVX512BW-ONLY-FAST-NEXT: movw $4644, %ax # imm = 0x1224 -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k6 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm10, %ymm20 {%k6} +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k4 +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm20 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm22 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm22, %xmm20, %xmm20 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22 ; AVX512BW-ONLY-FAST-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm20 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm23 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm20, %ymm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm8, %ymm20 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm20, %ymm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm13, %ymm11, %ymm20 {%k6} ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,6,13],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm20 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm15 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6] ; AVX512BW-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm7[1,8,15],zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm15, %xmm20, %xmm15 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm14, %zmm13 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm14 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm14, %zmm9 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm14 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm15, %xmm14, %xmm15 ; AVX512BW-ONLY-FAST-NEXT: movl $261632, %r10d # imm = 0x3FE00 ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k5 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm15 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm14 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm20 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm15, %xmm15 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm14, %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6] ; AVX512BW-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm19 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm15, %xmm19, %xmm15 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm0, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm14, %zmm15 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm0 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm14, %xmm19, %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm0 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm14 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,1,8,15],zero,zero,xmm14[4,11],zero,zero,xmm14[u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm15 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm19 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm19, %xmm15, %xmm15 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 208(%rdi), %xmm19 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm23, %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm24 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm23, %xmm24, %xmm23 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm23 ; AVX512BW-ONLY-FAST-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k2 ; AVX512BW-ONLY-FAST-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm18, %ymm14 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,xmm6[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm23, %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm18, %zmm14, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm22, %ymm21 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm23, %ymm15 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm24 = zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm23, %xmm24, %xmm23 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm23, %zmm15, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm15 {%k1} +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512BW-ONLY-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm2 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm21 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm14, %ymm0 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm18, %ymm0 {%k7} ; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm10, %ymm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm14[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm14, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm18, %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm18, %ymm0 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm13 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm9 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13,u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm14, %ymm0 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm10, %ymm14 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm14[6,13],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm16, %xmm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm14, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm17, %ymm0 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm17 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm17, %ymm0 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm14 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm14, %ymm0 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm12, %ymm14 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm16 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm16, %xmm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm14, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm16, %ymm0 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm16 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm16, %ymm0 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm18 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm17 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] ; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm14 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm14, %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[1,8,15,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm16, %xmm16 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm0, %ymm14 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm0 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u],zero,zero,zero,xmm17[6,13],zero,zero,xmm17[2,9,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm0, %ymm16 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm17 {%k1} ; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm12, %ymm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[3,10],zero,zero,zero,xmm17[6,13] +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm14 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm12, %ymm0 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm16 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm9, %ymm3, %ymm0 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm9, %ymm3, %ymm17 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm21 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm9, %ymm3 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm11, %ymm8 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm17 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm3, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm3, %ymm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm21 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm12, %ymm3 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm13, %ymm11 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm20[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm11, %xmm22, %xmm11 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm13, %xmm22, %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512BW-ONLY-FAST-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm11, %ymm9 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm21[u,u,3,10],zero,zero,zero,xmm21[6,13],zero,zero,xmm21[u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm13, %ymm12 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm21[u,u,3,10],zero,zero,zero,xmm21[6,13],zero,zero,xmm21[u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm21, %xmm21 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[1,8,15],zero,zero,xmm21[4,11,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm11, %xmm21, %xmm11 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm13, %xmm21, %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[0,7,14] ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm22, %xmm21 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm21, %ymm11 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm8, %xmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm21, %ymm13 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm11, %xmm21 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm8, %xmm8 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm11, %xmm11 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm11, %ymm0, %ymm21 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm19, %xmm19 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm19, %ymm8 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm19, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm20, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm11, %xmm19, %xmm11 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm11, %ymm21 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm0, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm6[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm9, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm9, %zmm0 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm9 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[5,12],zero,zero,xmm17[1,8,15],zero,zero,xmm17[u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm9, %xmm17, %xmm9 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm6[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm20[0],xmm17[0],xmm20[1],xmm17[1],xmm20[2],xmm17[2],xmm20[3],xmm17[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm11, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm11, %zmm9 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm2, %xmm0, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm11 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm22[0],xmm11[1],xmm22[1],xmm11[2],xmm22[2],xmm11[3],xmm22[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm11, %zmm2 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm18, %xmm11 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm18[0],xmm12[0],xmm18[1],xmm12[1],xmm18[2],xmm12[2],xmm18[3],xmm12[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm12, %zmm11 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm3, %xmm12, %xmm3 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm1, %zmm3 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm12, %ymm10 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm3 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm10, %ymm6 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] @@ -11453,396 +11570,396 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512BW-ONLY-FAST-NEXT: movl $4186112, %edi # imm = 0x3FE000 ; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,u,u,u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm5, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm1, %zmm4, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm0, %zmm4, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rdi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rdi) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: load_i8_stride7_vf64: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u> +; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15> +; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31> +; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31> +; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQBW-SLOW-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm9, %ymm3 {%k1} ; AVX512DQBW-SLOW-NEXT: kmovq %k1, %k2 ; AVX512DQBW-SLOW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm4, %xmm1, %xmm25 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm4, %xmm3, %xmm16 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm0, %ymm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX512DQBW-SLOW-NEXT: movw $8772, %ax # imm = 0x2244 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k6 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm8, %ymm5, %ymm3 {%k6} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm11, %ymm6, %ymm0 {%k6} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 ; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm8 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm26 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm10, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm25 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k5} ; AVX512DQBW-SLOW-NEXT: vmovdqa 288(%rdi), %ymm13 ; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 ; AVX512DQBW-SLOW-NEXT: movw $9288, %ax # imm = 0x2448 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k3 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm10 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm14, %xmm10, %xmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm16, %ymm15, %ymm10 {%k6} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm10[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm14[2],ymm10[3,4,5],ymm14[6],ymm10[7,8,9],ymm14[10],ymm10[11,12,13],ymm14[14],ymm10[15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm5, %xmm0, %xmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm5 {%k6} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k7 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm10, %ymm19 {%k7} -; AVX512DQBW-SLOW-NEXT: vmovdqa 416(%rdi), %ymm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm5, %ymm19 {%k7} +; AVX512DQBW-SLOW-NEXT: vmovdqa 416(%rdi), %ymm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX512DQBW-SLOW-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k4 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm14, %ymm10, %ymm20 {%k4} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm20 {%k4} ; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm20, %xmm22 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero ; AVX512DQBW-SLOW-NEXT: vporq %xmm22, %xmm20, %xmm20 ; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 ; AVX512DQBW-SLOW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm22 {%k4} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm9, %ymm22 {%k4} ; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm23 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm22, %ymm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm8, %ymm5, %ymm22 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm22, %ymm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm11, %ymm6, %ymm22 {%k3} ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm14 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm1, %xmm22, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm11 {%k5} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm0 {%k6} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQBW-SLOW-NEXT: vporq %xmm14, %xmm22, %xmm14 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm10 {%k5} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm9, %ymm2 {%k6} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm2, %xmm14, %xmm2 ; AVX512DQBW-SLOW-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k5 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm1 {%k2} -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm1, %xmm17 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm17, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm9, %xmm17 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm21[0],xmm17[0],xmm21[1],xmm17[1],xmm21[2],xmm17[2],xmm21[3],xmm17[3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm3, %xmm17, %xmm3 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm0, %zmm17 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm0 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm1 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQBW-SLOW-NEXT: vporq %xmm3, %xmm18, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm14 {%k2} +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm14, %xmm22 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm22, %xmm14, %xmm14 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm8, %xmm21 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm14, %xmm21, %xmm14 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm3, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm9, %ymm2 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm3, %xmm18 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm18, %xmm3, %xmm3 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQBW-SLOW-NEXT: vporq %xmm18, %xmm21, %xmm18 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 ; AVX512DQBW-SLOW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k2 ; AVX512DQBW-SLOW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm1 {%k2} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm3, %xmm18, %xmm3 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm0, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm18, %ymm3 {%k2} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm18, %xmm21, %xmm18 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm18, %zmm3, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm18 {%k1} ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm0 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm16, %ymm15, %ymm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm14, %ymm10, %ymm1 {%k6} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm0 {%k6} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm16, %ymm1 {%k4} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5,6],ymm3[7,8],ymm1[9,10],ymm3[11],ymm1[12,13,14],ymm3[15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm14, %ymm10, %ymm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm17 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm0 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm16, %ymm1 {%k6} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6],ymm3[7,8],ymm1[9,10,11],ymm3[12],ymm1[13,14],ymm3[15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm10, %ymm14, %ymm1 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k3} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k6} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k6} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k6} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm14 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm18 {%k1} ; AVX512DQBW-SLOW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm1, %xmm0, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm16, %ymm0 {%k3} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm0, %ymm19 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm16, %ymm15, %ymm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm3, %xmm2, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm2 {%k3} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm2, %ymm19 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k1} ; AVX512DQBW-SLOW-NEXT: kmovq %k1, %k7 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm20 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm0, %ymm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm10, %ymm14, %ymm0 {%k6} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm0, %ymm19 {%k2} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm10, %ymm14, %ymm0 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm0, %ymm20 {%k2} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm22 {%k6} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm2, %ymm21 {%k6} -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm16, %ymm15 {%k4} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm15[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k6} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm19 {%k2} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k2} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm22 {%k6} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm9, %ymm1, %ymm21 {%k6} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm17, %ymm0 {%k4} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] ; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm13, %ymm12 {%k6} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm3 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm12 +; AVX512DQBW-SLOW-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX512DQBW-SLOW-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm12 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm14, %ymm10 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm2, %ymm0 {%k4} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm6, %ymm2 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm8, %ymm5 {%k4} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm22[u,u,2,9],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm6 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm15, %ymm5 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm9, %ymm1, %ymm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm9, %ymm1 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k4} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm22[u,u,2,9],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm11 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQBW-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512DQBW-SLOW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm6, %ymm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm6, %ymm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14] +; AVX512DQBW-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm11, %ymm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm11, %xmm6 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm6, %ymm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm6, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm7, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm8, %zmm8 -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm0, %xmm9, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm9 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15] +; AVX512DQBW-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> +; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm7, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> +; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm8, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> +; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm11, %zmm11 +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm12 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm2, %xmm12, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm12 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k5} +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm9, %zmm2 {%k5} ; AVX512DQBW-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm21, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm9, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm1, %zmm3 {%k5} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm6[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm8, %xmm26, %xmm2 +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm21, %xmm9 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm9, %xmm12, %xmm9 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm3, %zmm9 {%k5} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm11, %xmm26, %xmm3 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k5} +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm3, %zmm1 {%k5} ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm19, %zmm0, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm20, %zmm0, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2],ymm2[3,4,5,6,7],ymm12[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm19, %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm20, %zmm0, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, (%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, (%rcx) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, (%rdi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rdi) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq @@ -11850,43 +11967,43 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQBW-FAST-LABEL: load_i8_stride7_vf64: ; AVX512DQBW-FAST: # %bb.0: ; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] +; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] +; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] +; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] +; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] +; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] +; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] +; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] +; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm12 ; AVX512DQBW-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512DQBW-FAST-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm2 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm1 {%k1} ; AVX512DQBW-FAST-NEXT: kmovq %k1, %k2 ; AVX512DQBW-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm4, %ymm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512DQBW-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 ; AVX512DQBW-FAST-NEXT: movw $8772, %ax # imm = 0x2244 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm8, %ymm4 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm13, %ymm11, %ymm4 {%k1} ; AVX512DQBW-FAST-NEXT: kmovq %k1, %k3 ; AVX512DQBW-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 @@ -11900,312 +12017,313 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm5, %ymm5 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQBW-FAST-NEXT: vmovdqa 240(%rdi), %xmm5 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqa 224(%rdi), %xmm7 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 240(%rdi), %xmm7 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqa 224(%rdi), %xmm8 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4 ; AVX512DQBW-FAST-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512DQBW-FAST-NEXT: kmovq %rax, %k5 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm4, %zmm2 {%k5} -; AVX512DQBW-FAST-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqa 288(%rdi), %ymm5 ; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX512DQBW-FAST-NEXT: movw $9288, %ax # imm = 0x2448 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k6 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm4, %ymm12 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm12[u,u,u,u,u,3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u],zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm12, %xmm21 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm10 {%k6} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm10[u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm10, %xmm21 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k7 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm10, %ymm21 {%k7} -; AVX512DQBW-FAST-NEXT: vmovdqa 416(%rdi), %ymm12 -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm10 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm6, %ymm21 {%k7} +; AVX512DQBW-FAST-NEXT: vmovdqa 416(%rdi), %ymm10 +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 ; AVX512DQBW-FAST-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k4 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm10, %ymm20 {%k4} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm20 {%k4} ; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm22 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero ; AVX512DQBW-FAST-NEXT: vporq %xmm22, %xmm20, %xmm20 ; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22 ; AVX512DQBW-FAST-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm20 {%k4} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4} ; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm23 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm20, %ymm13 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm8, %ymm20 {%k6} +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm20, %ymm9 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm13, %ymm11, %ymm20 {%k6} ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,6,13],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm20 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm14 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm15 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6] ; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm5[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm7[1,8,15],zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vporq %xmm15, %xmm20, %xmm15 ; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm14, %zmm13 {%k5} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm14 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm14, %zmm9 {%k5} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm14 {%k3} ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512DQBW-FAST-NEXT: vpor %xmm15, %xmm14, %xmm15 ; AVX512DQBW-FAST-NEXT: movl $261632, %r10d # imm = 0x3FE00 ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k5 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm15 {%k2} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm20 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm14 {%k2} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm20 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm15, %xmm15 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm14, %xmm14 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6] ; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm19 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm15, %xmm19, %xmm15 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm0, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm14, %zmm15 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm0 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm14, %xmm19, %xmm14 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm0 {%k6} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm14 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm18 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,1,8,15],zero,zero,xmm14[4,11],zero,zero,xmm14[u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm15 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm19 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm19, %xmm15, %xmm15 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512DQBW-FAST-NEXT: vmovdqa64 208(%rdi), %xmm19 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12] ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %xmm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm23, %xmm18 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm24 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm24, %xmm23 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm23 ; AVX512DQBW-FAST-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512DQBW-FAST-NEXT: kmovd %edi, %k2 ; AVX512DQBW-FAST-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm18, %ymm14 {%k2} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,xmm5[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm23, %xmm18 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm18, %zmm14, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm23, %ymm15 {%k2} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm24 = zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm24, %xmm23 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm23, %zmm15, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm15 {%k1} ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k3 ; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 ; AVX512DQBW-FAST-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512DQBW-FAST-NEXT: kmovq %rax, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm2 {%k2} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm6, %ymm0 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm21 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm14, %ymm0 {%k7} +; AVX512DQBW-FAST-NEXT: vporq %xmm21, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm18, %ymm0 {%k7} ; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm10, %ymm14 {%k1} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm14[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10] -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm14, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm18 {%k1} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10] +; AVX512DQBW-FAST-NEXT: vporq %xmm21, %xmm18, %xmm18 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm18, %ymm0 {%k3} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm13 {%k2} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm6, %ymm0 {%k1} -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm9 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm14, %ymm0 {%k7} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm10, %ymm14 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm14[6,13],zero,zero -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11] -; AVX512DQBW-FAST-NEXT: vporq %xmm16, %xmm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm14, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm17, %ymm0 {%k7} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm17 {%k6} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11] +; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm17, %ymm0 {%k3} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm6, %ymm0 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm14 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k6} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm14, %ymm0 {%k7} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm12, %ymm14 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm16 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm16, %xmm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm14, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm16, %ymm0 {%k7} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm16 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm17 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero +; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm16, %ymm0 {%k3} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm18 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] +; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] ; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm14, %zmm16 -; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm4, %ymm14 {%k2} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm17 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm4, %ymm16 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u],zero,zero,zero,xmm17[6,13],zero,zero,xmm17[2,9,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm16, %xmm17 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm18 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[1,8,15,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm16, %xmm16 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm0, %ymm16 {%k7} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm0 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm17, %ymm16 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm17 {%k1} ; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm12, %ymm0 {%k1} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm17 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[3,10],zero,zero,zero,xmm17[6,13] +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm14 {%k3} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm12, %ymm0 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k6} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm9, %ymm3, %ymm21 {%k4} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm9, %ymm3, %ymm17 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm0 {%k6} -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm9, %ymm3 {%k6} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm11, %ymm8 {%k4} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm17 {%k3} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm3, %ymm21 {%k4} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm3, %ymm18 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm0 {%k6} +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm12, %ymm3 {%k6} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm12 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm13, %ymm11 {%k4} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm20[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm11, %xmm22, %xmm11 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQBW-FAST-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm11, %ymm9 {%k3} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm13, %xmm22, %xmm13 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQBW-FAST-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm13, %ymm12 {%k2} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm13, %xmm0 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[0,7,14] -; AVX512DQBW-FAST-NEXT: vporq %xmm11, %xmm22, %xmm11 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm11, %ymm0 {%k3} -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm11, %xmm8, %xmm8 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQBW-FAST-NEXT: vporq %xmm13, %xmm22, %xmm13 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm13, %ymm0 {%k2} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm13, %xmm11, %xmm11 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm13 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15] ; AVX512DQBW-FAST-NEXT: vporq %xmm11, %xmm19, %xmm11 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm11, %ymm8 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm11, %ymm13 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm11, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm19, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm20, %zmm1 -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm21, %xmm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] +; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm11 +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm21, %xmm2 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[4,11],zero,zero,xmm21[0,7,14],zero,zero,xmm21[u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm21, %xmm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm20 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQBW-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm21 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm5[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm9, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm9, %zmm20 {%k5} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm9 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[5,12],zero,zero,xmm17[1,8,15],zero,zero,xmm17[u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm9, %xmm17, %xmm9 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm5[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm19[0],xmm17[0],xmm19[1],xmm17[1],xmm19[2],xmm17[2],xmm19[3],xmm17[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm9 {%k5} +; AVX512DQBW-FAST-NEXT: vporq %xmm2, %xmm21, %xmm2 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512DQBW-FAST-NEXT: vpshufb %xmm21, %xmm8, %xmm11 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm22[0],xmm11[1],xmm22[1],xmm11[2],xmm22[2],xmm11[3],xmm22[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm11, %zmm2 {%k5} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm18, %xmm11 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm18[0],xmm12[0],xmm18[1],xmm12[1],xmm18[2],xmm12[2],xmm18[3],xmm12[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm11 {%k5} ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm1, %zmm0 {%k5} -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm12, %ymm10 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb %xmm21, %xmm7, %xmm3 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm3, %zmm0 {%k5} +; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm10, %ymm6 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] +; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm3, %zmm3 ; AVX512DQBW-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm6, %ymm4 {%k1} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQBW-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512DQBW-FAST-NEXT: movl $4186112, %edi # imm = 0x3FE000 ; AVX512DQBW-FAST-NEXT: kmovd %edi, %k1 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm3 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm14, %zmm0, %zmm20 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm16, %zmm0, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm10, %xmm1 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero -; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm1, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm16, %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm17, %zmm0, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm6, %xmm3 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero +; AVX512DQBW-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm3, %zmm4, %zmm5 +; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm0, %zmm0 {%k1} ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, (%r9) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rdi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, (%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, (%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, (%rdi) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll index 9c6b03d69af65..3ca2aae97def0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -381,11 +381,11 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: pushq %rax ; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: movdqa 16(%rdi), %xmm11 ; SSE-NEXT: movdqa 32(%rdi), %xmm9 ; SSE-NEXT: movdqa 48(%rdi), %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 @@ -399,58 +399,58 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3],xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] ; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: packuswb %xmm15, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm13 ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] ; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,1,1,3] ; SSE-NEXT: packuswb %xmm8, %xmm8 ; SSE-NEXT: pand %xmm2, %xmm8 ; SSE-NEXT: por %xmm6, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[0,2,2,3] +; SSE-NEXT: pand %xmm6, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm8, %xmm8 ; SSE-NEXT: movdqa %xmm3, %xmm1 @@ -475,51 +475,51 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[3,3,3,3] +; SSE-NEXT: packuswb %xmm15, %xmm15 +; SSE-NEXT: pand %xmm3, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,2,3,3] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[2,0,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm12, %xmm12 -; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: por %xmm5, %xmm12 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[2,0,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: pand %xmm3, %xmm11 +; SSE-NEXT: por %xmm1, %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: packuswb %xmm15, %xmm15 +; SSE-NEXT: packuswb %xmm14, %xmm14 ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] ; SSE-NEXT: packuswb %xmm9, %xmm9 ; SSE-NEXT: pand %xmm3, %xmm9 @@ -528,42 +528,42 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] ; SSE-NEXT: packuswb %xmm13, %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,1,3] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm13, %xmm2 @@ -576,8 +576,8 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlps %xmm0, (%rdx) ; SSE-NEXT: movq %xmm8, (%rcx) -; SSE-NEXT: movq %xmm1, (%r8) -; SSE-NEXT: movq %xmm12, (%r9) +; SSE-NEXT: movq %xmm15, (%r8) +; SSE-NEXT: movq %xmm11, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq %xmm9, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -816,13 +816,12 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride8_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $328, %rsp # imm = 0x148 -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rdi), %xmm13 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: movdqa 32(%rdi), %xmm13 ; SSE-NEXT: movdqa 48(%rdi), %xmm8 -; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm15 +; SSE-NEXT: movdqa 64(%rdi), %xmm15 +; SSE-NEXT: movdqa 80(%rdi), %xmm4 ; SSE-NEXT: movdqa 96(%rdi), %xmm12 ; SSE-NEXT: movdqa 112(%rdi), %xmm9 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0] @@ -832,141 +831,145 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm3 ; SSE-NEXT: packuswb %xmm1, %xmm3 ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: packuswb %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: pxor %xmm7, %xmm7 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: packuswb %xmm11, %xmm11 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; SSE-NEXT: packuswb %xmm8, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm6 ; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,1,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm6, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm6, %xmm9 ; SSE-NEXT: pandn %xmm1, %xmm9 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3],xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7] -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] +; SSE-NEXT: movdqa %xmm15, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[1,1,1,1] ; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm5, %xmm15 +; SSE-NEXT: pand %xmm4, %xmm15 ; SSE-NEXT: por %xmm9, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,2,2,3] +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm9 ; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm10, %xmm0 @@ -977,7 +980,7 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,5] ; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: movdqa %xmm5, %xmm15 ; SSE-NEXT: pandn %xmm9, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm1, %xmm2 @@ -985,7 +988,7 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,7] ; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: pand %xmm5, %xmm9 ; SSE-NEXT: por %xmm15, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] @@ -1011,14 +1014,14 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm15, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: pand %xmm1, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] @@ -1032,12 +1035,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,2,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1049,22 +1051,23 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm8 ; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm8, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 @@ -1075,14 +1078,14 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm8 ; SSE-NEXT: pandn %xmm1, %xmm8 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm8, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1103,12 +1106,12 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: # xmm12 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm9 ; SSE-NEXT: pandn %xmm1, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm15[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm9, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm14[2],xmm11[3],xmm14[3] @@ -1134,7 +1137,7 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: movdqa %xmm5, %xmm14 ; SSE-NEXT: pandn %xmm9, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -1142,7 +1145,7 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,1,3] ; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: pand %xmm5, %xmm9 ; SSE-NEXT: por %xmm14, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] @@ -1161,21 +1164,21 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm9, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] ; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: packuswb %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: packuswb %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm15 ; SSE-NEXT: pandn %xmm9, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm15, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: por %xmm15, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,3,1,4,5,6,7] @@ -1190,35 +1193,35 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[0,1,3,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm8, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[0,1,3,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,2,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1230,10 +1233,10 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] @@ -1245,10 +1248,10 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd $255, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1263,7 +1266,7 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm15, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm4, (%rax) +; SSE-NEXT: movapd %xmm7, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, (%rax) ; SSE-NEXT: addq $328, %rsp # imm = 0x148 @@ -1961,20 +1964,19 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] @@ -2044,7 +2046,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2072,7 +2074,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2153,19 +2155,19 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE-NEXT: packuswb %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: packuswb %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[1,1,1,1] ; SSE-NEXT: packuswb %xmm15, %xmm15 ; SSE-NEXT: pand %xmm9, %xmm15 @@ -2225,9 +2227,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: pand %xmm11, %xmm7 ; SSE-NEXT: por %xmm15, %xmm7 -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm15, %xmm15 @@ -2245,9 +2247,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 @@ -2430,9 +2432,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: movdqa %xmm9, %xmm5 @@ -2447,9 +2449,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm1 @@ -2504,9 +2506,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -2610,8 +2612,8 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: packuswb %xmm1, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,2] +; SSE-NEXT: packuswb %xmm1, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,2] ; SSE-NEXT: movdqa %xmm11, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -2671,7 +2673,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: pshuflw $116, (%rsp), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm15, %xmm15 ; SSE-NEXT: movdqa %xmm9, %xmm2 @@ -2684,7 +2686,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshuflw $116, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm1 @@ -2747,7 +2749,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 @@ -2813,7 +2815,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm1, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r8) -; SSE-NEXT: movapd %xmm7, (%r9) +; SSE-NEXT: movapd %xmm10, (%r9) ; SSE-NEXT: movapd %xmm6, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm4, (%rax) @@ -3263,14 +3265,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-LABEL: load_i8_stride8_vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $360, %rsp # imm = 0x168 -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3283,8 +3284,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm3 @@ -3294,21 +3294,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm11 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm11 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm11 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] ; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm11 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm10 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12 @@ -3326,10 +3327,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm9 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -3338,8 +3338,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -3348,28 +3349,26 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm12, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -3381,7 +3380,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -3390,10 +3389,10 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm3 @@ -3402,34 +3401,35 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm9, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm11, %xmm15 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -3440,47 +3440,49 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -3491,47 +3493,46 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -3545,18 +3546,20 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm11 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] @@ -3566,22 +3569,21 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -3595,43 +3597,43 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm4 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm14, %xmm10 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -3648,38 +3650,38 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -3710,184 +3712,193 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-FAST-LABEL: load_i8_stride8_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $232, %rsp +; AVX2-FAST-NEXT: subq $248, %rsp ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm8 ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm1 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm10 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm7 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm15 ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm11 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm11 ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm9 +; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm5 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm9 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm12 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm12 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm12 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vmovdqa %xmm11, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm11 -; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm15, %xmm10 +; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm12 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm6, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm15 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm11, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-NEXT: vpermd (%rsp), %ymm2, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] @@ -3898,29 +3909,29 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] @@ -3928,25 +3939,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -3957,26 +3968,26 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] @@ -3984,7 +3995,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -3993,30 +4004,29 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) -; AVX2-FAST-NEXT: addq $232, %rsp +; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-NEXT: addq $248, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i8_stride8_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $360, %rsp # imm = 0x168 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4029,8 +4039,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3 @@ -4040,21 +4049,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm12 @@ -4072,10 +4082,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -4084,8 +4093,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -4094,28 +4104,26 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm12, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -4127,7 +4135,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm9, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4136,10 +4144,10 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm3 @@ -4148,34 +4156,35 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -4186,47 +4195,49 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4237,47 +4248,46 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4291,18 +4301,20 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] @@ -4312,22 +4324,21 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4341,43 +4352,43 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4394,38 +4405,38 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4460,91 +4471,91 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512F-SLOW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vpmovqb %ymm3, %xmm3 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vpmovqb %ymm2, %xmm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm10[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3] ; AVX512F-SLOW-NEXT: vpmovqb %zmm16, %xmm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 ; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm6 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vmovdqa %xmm12, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm19 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm12 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa %xmm15, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm22 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm20 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,6],ymm4[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm13 ; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm10 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm9 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm26 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 @@ -4552,41 +4563,43 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm23 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm14 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm22 -; AVX512F-SLOW-NEXT: vmovdqa %xmm11, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 @@ -4594,43 +4607,41 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm25 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm22 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm26 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm27 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm12 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm11, %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 @@ -4638,41 +4649,41 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm7 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm22 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm25 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm24 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm1 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm23 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm8, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm23 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm16, %zmm2 @@ -4680,45 +4691,43 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm13 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm25 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm26 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm27 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm23 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm16, %zmm2 @@ -4726,43 +4735,41 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm13, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm13 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm23 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm25 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm25 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm24 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm8, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm16, %zmm2 @@ -4770,11 +4777,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] @@ -4782,24 +4790,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm16, %zmm3 @@ -5588,7 +5594,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i8_stride8_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $2024, %rsp # imm = 0x7E8 +; SSE-NEXT: subq $2040, %rsp # imm = 0x7F8 ; SSE-NEXT: movdqa 64(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm8 @@ -5599,9 +5605,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 144(%rdi), %xmm10 ; SSE-NEXT: movdqa 160(%rdi), %xmm7 ; SSE-NEXT: movdqa 176(%rdi), %xmm13 -; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movdqa 192(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm15 ; SSE-NEXT: movdqa 224(%rdi), %xmm9 ; SSE-NEXT: movdqa 240(%rdi), %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0] @@ -5611,8 +5618,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm2 @@ -5650,8 +5658,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 48(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm8 @@ -5674,10 +5682,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa 464(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: movdqa 448(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm0 @@ -5739,20 +5747,19 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm2 @@ -5820,14 +5827,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3],xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] -; SSE-NEXT: packuswb %xmm15, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: packuswb %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm9, %xmm2 @@ -5849,12 +5856,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; SSE-NEXT: packuswb %xmm8, %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm1 @@ -5883,15 +5890,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,2,3] ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3],xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 @@ -6020,7 +6027,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6031,7 +6038,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6065,19 +6072,19 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,3] ; SSE-NEXT: movdqa %xmm13, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -6093,7 +6100,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] @@ -6101,56 +6108,55 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] ; SSE-NEXT: movdqa %xmm2, %xmm15 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] ; SSE-NEXT: packuswb %xmm5, %xmm6 ; SSE-NEXT: movdqa %xmm9, %xmm5 ; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm11[1,1,1,1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[1,1,1,1] ; SSE-NEXT: packuswb %xmm15, %xmm15 ; SSE-NEXT: pand %xmm9, %xmm15 ; SSE-NEXT: por %xmm5, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm5 @@ -6337,26 +6343,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm5, %xmm5 ; SSE-NEXT: pand %xmm12, %xmm5 ; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: movdqa %xmm9, %xmm15 ; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: pand %xmm9, %xmm7 ; SSE-NEXT: por %xmm15, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6415,7 +6421,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: movdqa %xmm9, %xmm15 ; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -6550,7 +6556,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 @@ -6560,15 +6566,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 @@ -6576,7 +6582,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm1 @@ -6740,14 +6747,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] @@ -6872,15 +6879,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: packuswb %xmm2, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,1,1] ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 @@ -7019,9 +7026,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] @@ -7063,11 +7071,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 @@ -7358,8 +7365,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[2,2,3,3] ; SSE-NEXT: packuswb %xmm8, %xmm8 ; SSE-NEXT: pand %xmm12, %xmm8 ; SSE-NEXT: por %xmm1, %xmm8 @@ -7449,7 +7455,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm4, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm15, 32(%rax) -; SSE-NEXT: movapd %xmm6, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -7464,7 +7471,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movapd %xmm1, 32(%rax) ; SSE-NEXT: movapd %xmm2, 16(%rax) ; SSE-NEXT: movapd %xmm3, (%rax) -; SSE-NEXT: addq $2024, %rsp # imm = 0x7E8 +; SSE-NEXT: addq $2040, %rsp # imm = 0x7F8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride8_vf64: @@ -7629,15 +7636,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm12, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -7730,11 +7737,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm6 @@ -7861,15 +7868,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] @@ -7962,13 +7969,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] @@ -8062,13 +8069,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] @@ -8329,72 +8336,73 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-LABEL: load_i8_stride8_vf64: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $840, %rsp # imm = 0x348 -; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm12, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm15 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX2-SLOW-NEXT: vmovdqa 272(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm6 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa 496(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm8 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 @@ -8402,118 +8410,117 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm11, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm15 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8521,100 +8528,101 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8623,100 +8631,101 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm14 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8724,104 +8733,100 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8829,103 +8834,106 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8933,101 +8941,100 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -9036,101 +9043,100 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm13, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 @@ -9138,50 +9144,51 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm13 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm13 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm13 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm3 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm12 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) @@ -9231,20 +9238,19 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa 336(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 336(%rdi), %xmm15 ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm14 ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 @@ -9269,19 +9275,16 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm15 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] @@ -9291,7 +9294,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm9 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9306,10 +9309,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 @@ -9330,7 +9333,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm10 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9340,55 +9343,53 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm12 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm12 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 @@ -9396,12 +9397,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] @@ -9421,57 +9423,55 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm11, %xmm15 +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm12 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vmovdqa %xmm15, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm8 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] @@ -9479,49 +9479,49 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm11 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm12 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm7 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 @@ -9529,62 +9529,61 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm7 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2,3] @@ -9608,25 +9607,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm9 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm5 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm6 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm7 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm7 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1],xmm5[2,3] @@ -9650,59 +9650,59 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm12 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm11 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm12 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm13 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm13 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm14 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm15 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm15 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm10 +; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm9 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] @@ -9726,61 +9726,64 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm12 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm13 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm13 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm14 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm15 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm15 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm8 +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm10 -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm11 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm10 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] @@ -9801,62 +9804,64 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm13 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm15 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm15 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] @@ -9900,72 +9905,73 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-LABEL: load_i8_stride8_vf64: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $840, %rsp # imm = 0x348 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm11, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 496(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 @@ -9973,118 +9979,117 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10092,100 +10097,101 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10194,100 +10200,101 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10295,104 +10302,100 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10400,103 +10403,106 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10504,101 +10510,100 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10607,101 +10612,100 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 @@ -10709,50 +10713,51 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) @@ -10793,23 +10798,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512F-SLOW-LABEL: load_i8_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm27 +; AVX512F-SLOW-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm17 ; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512F-SLOW-NEXT: vpmovqb %zmm0, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 496(%rdi), %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa 496(%rdi), %xmm3 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm7 ; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] @@ -10821,19 +10828,20 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm31 -; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm14 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm19 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX512F-SLOW-NEXT: vpmovqb %zmm27, %xmm4 +; AVX512F-SLOW-NEXT: vpmovqb %zmm17, %xmm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 ; AVX512F-SLOW-NEXT: movb $-64, %al @@ -10841,18 +10849,18 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm24 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512F-SLOW-NEXT: vpmovqb %zmm5, %xmm5 @@ -10865,397 +10873,397 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm30 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm28 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512F-SLOW-NEXT: vpmovqb %zmm4, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm16 +; AVX512F-SLOW-NEXT: vpmovqb %zmm16, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm15, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm26 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm7, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm24 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm21 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm4 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm27 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm15, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 432(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512F-SLOW-NEXT: vmovdqa 432(%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm22 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 400(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 400(%rdi), %xmm12 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm28 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm31 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm23 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm26 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm27, %zmm7 -; AVX512F-SLOW-NEXT: vpmovqb %zmm7, %xmm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm14, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] +; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm17, %zmm6 +; AVX512F-SLOW-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm19 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm19 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm12, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm19 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm4[7] ; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm25 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm20 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm17 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm18, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-SLOW-NEXT: vmovdqa %xmm13, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm19 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm18 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm27, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm21 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm17, %zmm9 +; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm18 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm21, %zmm2 +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm28 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm27, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm17, %zmm9 +; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm22 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm21, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm16, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm24 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm16 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm27, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm17, %zmm9 +; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm25 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm21, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm16, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm18 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -11264,98 +11272,102 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm25 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm27 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm26 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm27, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm28 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm30 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm31 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm8 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm17, %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm28 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm16 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm21, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm18, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm23 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm29 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -11363,86 +11375,88 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm17 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm27, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm22 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm17, %zmm9 +; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm19 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm17 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm19 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm24 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm29 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm20 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm22, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm26, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm20 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm24 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm1 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 @@ -11451,73 +11465,72 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm27, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm23, %zmm9 +; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm22, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm26, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%rsi) ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -11529,307 +11542,298 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%r9) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, (%rax) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-SLOW-NEXT: addq $456, %rsp # imm = 0x1C8 +; AVX512F-SLOW-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i8_stride8_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $408, %rsp # imm = 0x198 +; AVX512F-FAST-NEXT: subq $440, %rsp # imm = 0x1B8 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] -; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm14 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512F-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm4 +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm4 ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm10 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm3 ; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm12 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512F-FAST-NEXT: vmovdqa64 384(%rdi), %ymm31 -; AVX512F-FAST-NEXT: vpermd %ymm31, %ymm1, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm15 +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm11 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512F-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm5 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] ; AVX512F-FAST-NEXT: vmovdqa 368(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm25 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX512F-FAST-NEXT: vmovdqa 336(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm27 -; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm8 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm8 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm15 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX512F-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512F-FAST-NEXT: vpmovqb %zmm26, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512F-FAST-NEXT: vpmovqb %zmm2, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm28 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm16 ; AVX512F-FAST-NEXT: movb $-64, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 {%k1} -; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 {%k1} +; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 ; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm7 -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7] +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm8 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm10 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm7 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm29 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm20 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512F-FAST-NEXT: vpmovqb %zmm2, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm29 +; AVX512F-FAST-NEXT: vpmovqb %zmm29, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm0 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm16 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm10 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm30 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm26 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm12, %ymm19 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm13, %ymm21 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm16 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm21 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm13 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm5 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm13, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm23 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm9 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm12 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm15 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] -; AVX512F-FAST-NEXT: vpsrlq $8, %zmm26, %zmm3 +; AVX512F-FAST-NEXT: vpsrlq $8, %zmm28, %zmm3 ; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm9 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm14, %ymm20 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm15 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm6 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm22 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm6 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm17 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm16 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm10 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm14, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm15 ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm5 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512F-FAST-NEXT: vpsrlq $8, %zmm28, %zmm3 +; AVX512F-FAST-NEXT: vpsrlq $8, %zmm29, %zmm3 ; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm20 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm23 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm24 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm25 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm27 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm19 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm17 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm15 +; AVX512F-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm26, %zmm5 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm28, %zmm5 ; AVX512F-FAST-NEXT: vpmovqb %zmm5, %xmm5 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm18 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm8, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm5 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm21 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm29 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm25 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm26 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm7 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm5 ; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm16 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm14, %xmm20 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm9 -; AVX512F-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm7 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm8 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm28, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm29, %zmm3 ; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm11 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm8, %xmm22 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm12 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512F-FAST-NEXT: vpsrlq $24, %zmm26, %zmm5 +; AVX512F-FAST-NEXT: vpsrlq $24, %zmm28, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 ; AVX512F-FAST-NEXT: vpmovqb %zmm5, %xmm5 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm5 ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm17 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm21 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm21 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm16 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm6 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512F-FAST-NEXT: vpsrlq $24, %zmm28, %zmm3 +; AVX512F-FAST-NEXT: vpsrlq $24, %zmm29, %zmm3 ; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -11837,254 +11841,259 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm4 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm17 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm28 ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm2 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512F-FAST-NEXT: vpermd %ymm31, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm18 -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm10 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm15, %xmm24 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm10 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm22 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm12 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm16 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm26, %zmm10 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm10 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm20 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm14, %xmm22 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm15, %xmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm15, %xmm23 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm13 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm24, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm25 ; AVX512F-FAST-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm13 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm13 {%k1} ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm15 -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm9 -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm7 -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm20 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm11 +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm31 +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm10 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm11 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm24 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1,2],xmm6[3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm19 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm23, %zmm11 -; AVX512F-FAST-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm8 +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm21 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm29, %zmm12 +; AVX512F-FAST-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm11 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm11 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm11 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm11 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm7 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm12 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm10 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm13 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm8 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm15 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm9 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm15 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] -; AVX512F-FAST-NEXT: vpsrlq $40, %zmm26, %zmm14 +; AVX512F-FAST-NEXT: vpsrlq $40, %zmm25, %zmm14 ; AVX512F-FAST-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm14 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm15 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm21 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm23 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm20 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm11 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm9 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm13 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm8, %xmm22 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm13 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] -; AVX512F-FAST-NEXT: vpsrlq $40, %zmm19, %zmm13 +; AVX512F-FAST-NEXT: vpsrlq $40, %zmm29, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm18 ; AVX512F-FAST-NEXT: vpmovqb %zmm13, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm31 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm11 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm11 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm12 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm29 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm10 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm1 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm11 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm12 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm16 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm28 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm12 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm23 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm24 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm28 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm15 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm16 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm26, %zmm14 +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm25, %zmm14 ; AVX512F-FAST-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm10 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm14 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm15 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm15 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm8 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm11 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm19, %zmm13 +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm18, %zmm13 ; AVX512F-FAST-NEXT: vpmovqb %zmm13, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm17 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm5 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm21 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm6 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm11 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm11 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm12 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm11 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm12 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] -; AVX512F-FAST-NEXT: vpsrlq $56, %zmm26, %zmm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] +; AVX512F-FAST-NEXT: vpsrlq $56, %zmm25, %zmm11 ; AVX512F-FAST-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm9, %ymm9 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm3 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512F-FAST-NEXT: vpsrlq $56, %zmm19, %zmm2 -; AVX512F-FAST-NEXT: vpmovqb %zmm2, %xmm2 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; AVX512F-FAST-NEXT: vpsrlq $56, %zmm18, %zmm3 +; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm1, (%rsi) ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -12096,674 +12105,679 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm1, (%r9) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-FAST-NEXT: addq $408, %rsp # imm = 0x198 +; AVX512F-FAST-NEXT: addq $440, %rsp # imm = 0x1B8 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: load_i8_stride8_vf64: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: subq $744, %rsp # imm = 0x2E8 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 ; AVX512BW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa 496(%rdi), %xmm5 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm5, %xmm7 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa 496(%rdi), %xmm4 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm4, %xmm24 ; AVX512BW-SLOW-NEXT: vmovdqa 480(%rdi), %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm4 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm25 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vmovdqa 464(%rdi), %xmm6 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm6, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa 448(%rdi), %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm6, %xmm8 +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm6, %xmm4 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX512BW-SLOW-NEXT: vmovdqa 448(%rdi), %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm7, %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm7, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX512BW-SLOW-NEXT: vpmovqb %ymm5, %xmm5 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm4 +; AVX512BW-SLOW-NEXT: vpmovqb %ymm4, %xmm4 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX512BW-SLOW-NEXT: vmovdqa 368(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm2, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm4 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512BW-SLOW-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm2, %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, %xmm9 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 352(%rdi), %xmm27 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm27, %xmm6 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vmovdqa 336(%rdi), %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm11 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm2, %xmm22 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm12 ; AVX512BW-SLOW-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX512BW-SLOW-NEXT: vpmovqb %zmm1, %xmm12 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm20 +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm15 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, %xmm9 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] +; AVX512BW-SLOW-NEXT: vpmovqb %zmm1, %xmm11 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm20 ; AVX512BW-SLOW-NEXT: movb $-64, %al ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm20 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa 240(%rdi), %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, %xmm8 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm20 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm28 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm28, %xmm7 ; AVX512BW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, %xmm14 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm1, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm1, %xmm17 -; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm1, %xmm18 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm1, %xmm29 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm18[0],xmm13[0],xmm18[1],xmm13[1],xmm18[2],xmm13[2],xmm18[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-SLOW-NEXT: vmovdqa64 208(%rdi), %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm17, %xmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm8, %xmm16 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm16[0],xmm10[0],xmm16[1],xmm10[1],xmm16[2],xmm10[2],xmm16[3],xmm10[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5,6],ymm7[7] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512BW-SLOW-NEXT: vpmovqb %ymm13, %xmm13 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm10 +; AVX512BW-SLOW-NEXT: vpmovqb %ymm10, %xmm10 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5],ymm10[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7] ; AVX512BW-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm18 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm16 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, %xmm10 ; AVX512BW-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm15 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm18[0],xmm15[1],xmm18[1],xmm15[2],xmm18[2],xmm15[3],xmm18[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, %xmm14 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm16[0],xmm12[1],xmm16[1],xmm12[2],xmm16[2],xmm12[3],xmm16[3] ; AVX512BW-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm21 ; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm19 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, %xmm15 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm19[0],xmm21[0],xmm19[1],xmm21[1],xmm19[2],xmm21[2],xmm19[3],xmm21[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm2 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm21[0],xmm19[1],xmm21[1],xmm19[2],xmm21[2],xmm19[3],xmm21[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm13 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 400(%rdi), %xmm12 -; AVX512BW-SLOW-NEXT: vmovdqa 416(%rdi), %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 432(%rdi), %xmm16 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, %xmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm25, %xmm2 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa 400(%rdi), %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 416(%rdi), %xmm20 +; AVX512BW-SLOW-NEXT: vmovdqa64 432(%rdi), %xmm29 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, %xmm19 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm24, %xmm24 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm26, %xmm25 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm24, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, %xmm11 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm26, %xmm24 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm30, %xmm16 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm30, %xmm25 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm27 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm16, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm16, %xmm22 -; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm13, %xmm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm13, %xmm18 +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm29, %xmm24 +; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm20, %xmm25 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, %xmm23 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm12, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm12, %xmm28 -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm4, %xmm25 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm3, %xmm24 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm3, %xmm21 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm7, %xmm25 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm31, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm31, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm9, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm24 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm24[0],xmm4[0],xmm24[1],xmm4[1],xmm24[2],xmm4[2],xmm24[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm6, %zmm4 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm4, %xmm4 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm8, %xmm12 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm14, %xmm16 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, %xmm20 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm17, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm29, %xmm31 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm24[0],xmm4[0],xmm24[1],xmm4[1],xmm24[2],xmm4[2],xmm24[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 176(%rdi), %xmm25 +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm31, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm27, %xmm3 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm22, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm24 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm31 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm15, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm28, %xmm9 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm28, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm18, %xmm3 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm17, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm24 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm25, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm0, %xmm24 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm24[0],xmm4[0],xmm24[1],xmm4[1],xmm24[2],xmm4[2],xmm24[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm0, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm6, %xmm25 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm25[0],xmm3[0],xmm25[1],xmm3[1],xmm25[2],xmm3[2],xmm25[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vmovdqa 144(%rdi), %xmm0 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm0, %xmm0 -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm8, %xmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm8, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm6, %xmm30 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm30[0],xmm0[0],xmm30[1],xmm0[1],xmm30[2],xmm0[2],xmm30[3],xmm0[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm23, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm15, %xmm17 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm15, %zmm2 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm3 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm15, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm28, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm5, %zmm3 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm19, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm19, %xmm20 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm12, %xmm25 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm22, %xmm23 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm18, %xmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm18, %xmm22 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm29, %xmm24 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm29, %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm23, %xmm30 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, %xmm16 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm28, %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm28, %xmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm18, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm26[0],xmm0[0],xmm26[1],xmm0[1],xmm26[2],xmm0[2],xmm26[3],xmm0[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm21, %xmm18 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm21, %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm7, %xmm17 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm7, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm19[0],xmm0[0],xmm19[1],xmm0[1],xmm19[2],xmm0[2],xmm19[3],xmm0[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm13, %xmm29 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm13, %zmm5 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm27, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm31, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm22, %zmm13 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm13, %xmm13 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm12, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm16, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm20, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm31, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm26 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm21, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm5, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm24, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm31, %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm23, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm4 ; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm1 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm17, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, %xmm28 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm28, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm15, %zmm3 +; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm5, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm3 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm25, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm24, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, (%rsp) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm19[0],xmm4[0],xmm19[1],xmm4[1],xmm19[2],xmm4[2],xmm19[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm18, %xmm30 +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm17, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm30[0],xmm13[1],xmm30[1],xmm13[2],xmm30[2],xmm13[3],xmm30[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vmovdqa %xmm12, %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm27, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3] +; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm6, %zmm13 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm13, %xmm13 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm26, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm21, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm22, %xmm21 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm30 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm29, %xmm22 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm25 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm7, %xmm9 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm31, %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm23, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm14, %xmm23 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm2 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 +; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm28, %zmm2 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm11, %xmm28 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm23, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm26 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm10, %xmm30 -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm18, %xmm2 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm24, %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm26, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm18, %xmm30 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm18, %xmm24 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm18, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm30[0],xmm2[1],xmm30[1],xmm2[2],xmm30[2],xmm2[3],xmm30[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5],ymm2[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm13[5],ymm2[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm29, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm10, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm27, %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm27, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm17, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm27, %xmm12 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm27, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm13, %zmm5 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm8, %xmm27 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm10, %zmm13 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm13, %xmm13 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm7, %xmm24 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm16, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm20, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm31, %xmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm31, %xmm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm29, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm27, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm31, %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm8, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm11, %xmm16 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm0 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm23, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm28, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm9, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm20 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm17, %zmm3 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm14, %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm15, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm19, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm23, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm31, %xmm30 -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm18, %xmm3 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm30[0],xmm3[1],xmm30[1],xmm3[2],xmm30[2],xmm3[3],xmm30[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm17, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm13, %zmm5 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm24, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm29, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm27, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm0 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm2 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $32, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm19, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm15, %xmm19 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm16, %xmm4 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm31, %xmm26 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm18, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm26 = xmm30[0],xmm26[0],xmm30[1],xmm26[1],xmm30[2],xmm26[2],xmm30[3],xmm26[3] +; AVX512BW-SLOW-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm24, %xmm19 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, %xmm26 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm18, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm6 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, %xmm11 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm17, %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm26[0],xmm6[0],xmm26[1],xmm6[1],xmm26[2],xmm6[2],xmm26[3],xmm6[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] -; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm13, %zmm6 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm6, %xmm6 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm24 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm27, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] +; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm10, %zmm5 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, %xmm17 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm24, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm24, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm26[0],xmm6[0],xmm26[1],xmm6[1],xmm26[2],xmm6[2],xmm26[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm11, %xmm17 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm25, %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm25, %xmm5 ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm29, %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm27, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm31, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm16, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm23, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm14, %zmm2 +; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm20, %zmm2 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm19, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm16, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm31, %xmm26 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm18, %xmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm18, %xmm31 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm26 = xmm30[0],xmm26[0],xmm30[1],xmm26[1],xmm30[2],xmm26[2],xmm30[3],xmm26[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm26, %xmm19 +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm18, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm24, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm27, %xmm5 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm30, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm13, %zmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm30, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] +; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm10, %zmm5 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm17, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm17, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm24, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm25, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm4 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm18 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm29, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm27, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm31, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm20, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm16, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm0 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm26, %xmm2 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm23, %xmm0 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm26 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm2 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm14, %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm19, %zmm2 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm28, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm23, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm31, %xmm9 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512BW-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm8 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm9 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5],ymm9[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm30, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm7 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm8 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm13, %zmm7 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm7, %xmm7 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm24, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm9 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm27, %xmm9 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm30, %xmm6 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] +; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm10, %zmm6 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm17, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm28, %xmm7 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm24, %xmm8 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm17, %xmm6 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm9 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm7 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm6 ; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm18, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm29, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm27, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm31, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm20, %xmm8 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm16, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm20, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm23, %xmm1 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm14, %zmm3 +; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm19, %zmm3 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -12792,128 +12806,129 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-LABEL: load_i8_stride8_vf64: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: subq $328, %rsp # imm = 0x148 -; AVX512BW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm18 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512BW-FAST-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512BW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm30 ; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm30, %ymm1 -; AVX512BW-FAST-NEXT: vmovdqa %ymm2, %ymm9 +; AVX512BW-FAST-NEXT: vmovdqa %ymm2, %ymm11 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512BW-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX512BW-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm31 ; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm31, %ymm2 -; AVX512BW-FAST-NEXT: vmovdqa %ymm3, %ymm11 +; AVX512BW-FAST-NEXT: vmovdqa %ymm3, %ymm8 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512BW-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm17 -; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm17, %ymm2 -; AVX512BW-FAST-NEXT: vmovdqa %ymm3, %ymm8 +; AVX512BW-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm19 +; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm19, %ymm2 +; AVX512BW-FAST-NEXT: vmovdqa %ymm3, %ymm9 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FAST-NEXT: vmovdqa64 384(%rdi), %ymm28 -; AVX512BW-FAST-NEXT: vpermd %ymm28, %ymm0, %ymm14 +; AVX512BW-FAST-NEXT: vmovdqa64 384(%rdi), %ymm29 +; AVX512BW-FAST-NEXT: vpermd %ymm29, %ymm0, %ymm14 ; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm3 ; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm10 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-FAST-NEXT: vmovdqa64 368(%rdi), %xmm20 -; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm20, %xmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 352(%rdi), %xmm19 -; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm19, %xmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 368(%rdi), %xmm21 +; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm21, %xmm2 +; AVX512BW-FAST-NEXT: vmovdqa 352(%rdi), %xmm4 +; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm3 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm24 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-FAST-NEXT: vmovdqa64 336(%rdi), %xmm18 -; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm18, %xmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 320(%rdi), %xmm29 -; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm29, %xmm6 +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm23 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-FAST-NEXT: vmovdqa 336(%rdi), %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm12, %xmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 320(%rdi), %xmm28 +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm28, %xmm6 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512BW-FAST-NEXT: vpmovqb %zmm4, %xmm5 +; AVX512BW-FAST-NEXT: vpmovqb %zmm18, %xmm5 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm21 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm20 ; AVX512BW-FAST-NEXT: movb $-64, %al ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm21 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX512BW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm16 -; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm16, %ymm1 +; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm17 +; AVX512BW-FAST-NEXT: vpshufb %ymm11, %ymm17, %ymm1 ; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512BW-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm5 +; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm5 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm1[7] ; AVX512BW-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512BW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm2 -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm6 +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm6 ; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdi), %ymm27 -; AVX512BW-FAST-NEXT: vpermd %ymm27, %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm9 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX512BW-FAST-NEXT: vpermd %ymm27, %ymm0, %ymm16 +; AVX512BW-FAST-NEXT: vpshufb %ymm10, %ymm16, %ymm8 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-FAST-NEXT: vmovdqa64 112(%rdi), %xmm25 -; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm25, %xmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 96(%rdi), %xmm23 -; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm23, %xmm7 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX512BW-FAST-NEXT: vmovdqa64 112(%rdi), %xmm26 +; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm26, %xmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 96(%rdi), %xmm24 +; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm24, %xmm7 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX512BW-FAST-NEXT: vmovdqa64 80(%rdi), %xmm22 -; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm22, %xmm26 -; AVX512BW-FAST-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm9, %xmm24 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm24[0],xmm26[0],xmm24[1],xmm26[1],xmm24[2],xmm26[2],xmm24[3],xmm26[3] +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm22, %xmm25 +; AVX512BW-FAST-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm8, %xmm23 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1,2],xmm10[3] ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 ; AVX512BW-FAST-NEXT: vpmovqb %zmm10, %xmm13 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm30, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa %ymm8, %ymm5 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm31, %ymm13 -; AVX512BW-FAST-NEXT: vmovdqa %ymm8, %ymm7 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm30, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa %ymm9, %ymm11 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm31, %ymm13 +; AVX512BW-FAST-NEXT: vmovdqa %ymm9, %ymm6 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm17, %ymm13 -; AVX512BW-FAST-NEXT: vmovdqa %ymm8, %ymm6 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm14, %ymm15 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm19, %ymm13 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm20, %xmm15 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm19, %xmm24 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm24[0],xmm15[0],xmm24[1],xmm15[1],xmm24[2],xmm15[2],xmm24[3],xmm15[3] -; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm24 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm18, %xmm26 -; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm29, %xmm21 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm21[0],xmm26[0],xmm21[1],xmm26[1],xmm21[2],xmm26[2],xmm21[3],xmm26[3] +; AVX512BW-FAST-NEXT: vmovdqa64 %xmm21, %xmm5 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm21, %xmm15 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm23 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm15[0],xmm23[1],xmm15[1],xmm23[2],xmm15[2],xmm23[3],xmm15[3] +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm23 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-FAST-NEXT: vmovdqa %xmm12, %xmm7 +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm12, %xmm25 +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm28, %xmm20 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm20[0],xmm25[0],xmm20[1],xmm25[1],xmm20[2],xmm25[2],xmm20[3],xmm25[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3] -; AVX512BW-FAST-NEXT: vpsrlq $8, %zmm4, %zmm15 +; AVX512BW-FAST-NEXT: vpsrlq $8, %zmm18, %zmm15 ; AVX512BW-FAST-NEXT: vpmovqb %zmm15, %xmm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm16, %ymm0 -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb %ymm11, %ymm17, %ymm0 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm15 -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm11 +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm16, %ymm11 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm25, %xmm11 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm23, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm26, %xmm11 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm24, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm22, %xmm13 -; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm9, %xmm15 +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm22, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm8, %xmm15 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] ; AVX512BW-FAST-NEXT: vpsrlq $8, %zmm10, %zmm13 @@ -12922,95 +12937,97 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm30, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm31, %ymm11 -; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm7 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm30, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %ymm9, %ymm21 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm31, %ymm11 +; AVX512BW-FAST-NEXT: vmovdqa64 %ymm6, %ymm25 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm17, %ymm11 -; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm8 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm12 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm19, %ymm11 +; AVX512BW-FAST-NEXT: vmovdqa %ymm6, %ymm9 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm20, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm19, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm13 +; AVX512BW-FAST-NEXT: vmovdqa %xmm4, %xmm1 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-FAST-NEXT: vmovdqa64 %xmm18, %xmm26 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm18, %xmm15 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm29, %xmm21 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm21[0],xmm15[0],xmm21[1],xmm15[1],xmm21[2],xmm15[2],xmm21[3],xmm15[3] +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 %xmm7, %xmm23 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm28, %xmm20 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm20[0],xmm15[0],xmm20[1],xmm15[1],xmm20[2],xmm15[2],xmm20[3],xmm15[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] -; AVX512BW-FAST-NEXT: vpsrlq $16, %zmm4, %zmm15 +; AVX512BW-FAST-NEXT: vpsrlq $16, %zmm18, %zmm15 ; AVX512BW-FAST-NEXT: vpmovqb %zmm15, %xmm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm16, %ymm0 -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm17, %ymm0 +; AVX512BW-FAST-NEXT: vpshufb %ymm25, %ymm3, %ymm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm15 -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm8 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm15[5],ymm8[6,7] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm25, %xmm8 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm23, %xmm11 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm16, %ymm9 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm26, %xmm9 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm24, %xmm11 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm22, %xmm11 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] ; AVX512BW-FAST-NEXT: vpsrlq $16, %zmm10, %zmm11 ; AVX512BW-FAST-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm30, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm31, %ymm8 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm30, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa %ymm4, %ymm6 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm31, %ymm9 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm31 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm17, %ymm8 +; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm19, %ymm9 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm14, %ymm11 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5],ymm11[6,7] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm20, %xmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 %xmm19, %xmm24 -; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm19, %xmm12 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 +; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm12 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm18, %xmm14 -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm29, %xmm15 +; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm14 +; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm28, %xmm15 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3] -; AVX512BW-FAST-NEXT: vpsrlq $24, %zmm4, %zmm14 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512BW-FAST-NEXT: vpsrlq $24, %zmm18, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, %zmm25 ; AVX512BW-FAST-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm11 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm16, %ymm0 -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm17, %ymm0 +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] ; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm2, %ymm2 -; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm16, %ymm1 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm25, %xmm1 -; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm23, %xmm2 +; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm26, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm24, %xmm2 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm22, %xmm2 -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm3 +; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm3 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512BW-FAST-NEXT: vpsrlq $24, %zmm10, %zmm2 @@ -13020,183 +13037,184 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] -; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vpermd (%rsp), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX512BW-FAST-NEXT: vpermd (%rsp), %ymm3, %ymm7 # 32-byte Folded Reload ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm2 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm8 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vpermd %ymm28, %ymm3, %ymm14 +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm2 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm9 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX512BW-FAST-NEXT: vpermd %ymm29, %ymm3, %ymm14 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm11 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm12 +; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm11 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm20, %xmm12 -; AVX512BW-FAST-NEXT: vmovdqa64 %xmm20, %xmm16 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm19, %xmm15 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm19, %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 %xmm19, %xmm16 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm20, %xmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 %xmm20, %xmm17 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm18, %xmm15 -; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm29, %xmm28 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3] +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm20 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm23, %xmm15 +; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm28, %xmm29 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm29[0],xmm15[0],xmm29[1],xmm15[1],xmm29[2],xmm15[2],xmm29[3],xmm15[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] -; AVX512BW-FAST-NEXT: vpsrlq $32, %zmm17, %zmm15 +; AVX512BW-FAST-NEXT: vpsrlq $32, %zmm18, %zmm15 ; AVX512BW-FAST-NEXT: vpmovqb %zmm15, %xmm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm12 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm12 {%k1} ; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm19 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm19, %ymm8 +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm19, %ymm9 ; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm18 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm18, %ymm15 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5,6],ymm8[7] +; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm18, %ymm15 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5,6],ymm9[7] ; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vpermd %ymm27, %ymm3, %ymm20 +; AVX512BW-FAST-NEXT: vpermd %ymm27, %ymm3, %ymm21 ; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm20, %ymm13 +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm21, %ymm13 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm25, %xmm8 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm23, %xmm11 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm22, %xmm11 -; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm9, %xmm13 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm26, %xmm9 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm24, %xmm11 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm22, %xmm11 +; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm8, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] ; AVX512BW-FAST-NEXT: vpsrlq $32, %zmm10, %zmm11 ; AVX512BW-FAST-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm28 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm29 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm8 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm11 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5,6],ymm8[7] +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm9 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm11 +; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm11 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm11 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm16, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm24, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm17, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm26, %xmm21 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm29, %xmm27 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3] +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm23, %xmm20 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm28, %xmm27 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm27[0],xmm20[0],xmm27[1],xmm20[1],xmm27[2],xmm20[2],xmm27[3],xmm20[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm12[3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512BW-FAST-NEXT: vpsrlq $40, %zmm17, %zmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512BW-FAST-NEXT: vpsrlq $40, %zmm25, %zmm12 ; AVX512BW-FAST-NEXT: vpmovqb %zmm12, %xmm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1],xmm3[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm19, %ymm8 -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm18, %ymm12 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7] -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm12 -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm20, %ymm4 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm19, %ymm9 +; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm18, %ymm12 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7] +; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm12 +; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm21, %ymm4 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm25, %xmm8 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm23, %xmm11 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm26, %xmm9 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm24, %xmm11 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm22, %xmm11 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm12 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] ; AVX512BW-FAST-NEXT: vpsrlq $40, %zmm10, %zmm11 ; AVX512BW-FAST-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm21 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm20 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm3 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm5, %ymm4 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm3 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm4 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm8 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5],ymm8[6,7] +; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm4 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm4 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm25, %ymm14, %ymm9 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5],ymm9[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm16, %xmm8 -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm24, %xmm11 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm16, %xmm9 +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm17, %xmm11 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm26, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm29, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm23, %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm28, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3] ; AVX512BW-FAST-NEXT: vpsrlq $48, %zmm27, %zmm12 ; AVX512BW-FAST-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm9 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm9 {%k1} ; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm19, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm18, %ymm12 +; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm18, %ymm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,6],ymm3[7] -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm12 -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm20, %ymm13 +; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm12 +; AVX512BW-FAST-NEXT: vpshufb %ymm25, %ymm21, %ymm13 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm3[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm25, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm23, %xmm4 +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm26, %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm24, %xmm4 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] ; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm22, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm11 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] ; AVX512BW-FAST-NEXT: vpsrlq $48, %zmm10, %zmm11 ; AVX512BW-FAST-NEXT: vpmovqb %zmm11, %xmm11 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm8, %zmm3 +; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm0 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm0 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm1 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm1 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm2, %ymm1 +; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm5, %ymm1 ; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm14, %ymm2 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm16, %xmm2 -; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm24, %xmm4 +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm17, %xmm4 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm26, %xmm8 -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm29, %xmm11 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3] -; AVX512BW-FAST-NEXT: vpsrlq $56, %zmm27, %zmm8 -; AVX512BW-FAST-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm23, %xmm9 +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm28, %xmm11 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] +; AVX512BW-FAST-NEXT: vpsrlq $56, %zmm27, %zmm9 +; AVX512BW-FAST-NEXT: vpmovqb %zmm9, %xmm9 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm19, %ymm0 -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm18, %ymm8 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm15, %ymm8 -; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm20, %ymm11 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5],ymm11[6,7] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm25, %xmm5 -; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm23, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm19, %ymm0 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm18, %ymm9 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm15, %ymm9 +; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm21, %ymm11 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm26, %xmm5 +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm24, %xmm1 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm22, %xmm5 -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm4 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX512BW-FAST-NEXT: vpsrlq $56, %zmm10, %zmm4 @@ -13212,9 +13230,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vmovaps %zmm1, (%rcx) ; AVX512BW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FAST-NEXT: vmovaps %zmm1, (%r8) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm28, (%r9) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, (%r9) ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, (%rax) ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll index 8f160e2bafda0..3cc97d69e8b03 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -1042,75 +1042,75 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm9 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm9 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm9[2],xmm3[3,4],xmm9[5],xmm3[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm3, %ymm9 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm3[2],xmm10[3,4],xmm3[5],xmm10[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm9, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpermd (%rdi), %ymm7, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,u,3,3,u,4,4,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm5, 128(%rcx) +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1],xmm2[2],xmm9[3,4],xmm2[5],xmm9[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpermd (%rdi), %ymm8, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm6, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,3,3,u,4,4,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm3, 128(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 160(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 96(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 160(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -1119,75 +1119,75 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm7, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm9[2],xmm3[3,4],xmm9[5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm3[2],xmm10[3,4],xmm3[5],xmm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm9, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm9, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm7, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm8, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <2,u,3,3,u,4,4,u> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1],xmm2[2],xmm9[3,4],xmm2[5],xmm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm10, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm8, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm6, %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,3,3,u,4,4,u> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm8, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 128(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 160(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -1308,18 +1308,17 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride3_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $360, %rsp # imm = 0x168 +; SSE-NEXT: subq $328, %rsp # imm = 0x148 ; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa 32(%rdi), %xmm6 +; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rsi), %xmm5 ; SSE-NEXT: movdqa (%rdx), %xmm0 ; SSE-NEXT: movdqa 16(%rdx), %xmm9 ; SSE-NEXT: movdqa 32(%rdx), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm0, %xmm10 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1327,7 +1326,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] @@ -1336,7 +1335,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 @@ -1348,225 +1347,223 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,2,2] ; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 32(%rsi), %xmm4 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 32(%rsi), %xmm8 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,2,2] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdx), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm15 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdx), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa 64(%rsi), %xmm12 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 64(%rdi), %xmm6 +; SSE-NEXT: movdqa 64(%rsi), %xmm14 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdx), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 80(%rdi), %xmm2 -; SSE-NEXT: movdqa 80(%rsi), %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 80(%rdi), %xmm6 +; SSE-NEXT: movdqa 80(%rsi), %xmm12 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdx), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 96(%rdi), %xmm2 -; SSE-NEXT: movdqa 96(%rsi), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 96(%rdi), %xmm6 +; SSE-NEXT: movdqa 96(%rsi), %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm13 ; SSE-NEXT: movdqa 112(%rdx), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 112(%rdi), %xmm4 -; SSE-NEXT: movdqa 112(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 112(%rdi), %xmm8 +; SSE-NEXT: movdqa 112(%rsi), %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: por %xmm3, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] ; SSE-NEXT: pandn %xmm1, %xmm11 ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[2,1,1,0,4,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: por %xmm2, %xmm5 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,2,3,3] ; SSE-NEXT: movdqa %xmm0, %xmm1 @@ -1575,71 +1572,71 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd $250, (%rsp), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,2,3,3] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm2[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm3[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm1[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por %xmm2, %xmm12 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm10, %xmm2 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm3, %xmm12 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm12, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, 368(%rcx) -; SSE-NEXT: movdqa %xmm2, 320(%rcx) +; SSE-NEXT: movdqa %xmm3, 320(%rcx) ; SSE-NEXT: movdqa %xmm1, 272(%rcx) -; SSE-NEXT: movdqa %xmm12, 224(%rcx) -; SSE-NEXT: movdqa %xmm15, 176(%rcx) -; SSE-NEXT: movdqa %xmm3, 128(%rcx) -; SSE-NEXT: movdqa %xmm4, 80(%rcx) -; SSE-NEXT: movdqa %xmm5, 32(%rcx) +; SSE-NEXT: movdqa %xmm14, 224(%rcx) +; SSE-NEXT: movdqa %xmm2, 176(%rcx) +; SSE-NEXT: movdqa %xmm4, 128(%rcx) +; SSE-NEXT: movdqa %xmm5, 80(%rcx) +; SSE-NEXT: movdqa %xmm6, 32(%rcx) ; SSE-NEXT: movdqa %xmm11, 352(%rcx) -; SSE-NEXT: movdqa %xmm7, 336(%rcx) -; SSE-NEXT: movdqa %xmm9, 304(%rcx) -; SSE-NEXT: movdqa %xmm13, 288(%rcx) -; SSE-NEXT: movdqa %xmm14, 256(%rcx) +; SSE-NEXT: movdqa %xmm10, 336(%rcx) +; SSE-NEXT: movdqa %xmm13, 304(%rcx) +; SSE-NEXT: movdqa %xmm15, 288(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 256(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1662,27 +1659,27 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: addq $360, %rsp # imm = 0x168 +; SSE-NEXT: addq $328, %rsp # imm = 0x148 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: subq $280, %rsp # imm = 0x118 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm14 @@ -1701,38 +1698,38 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm11 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 @@ -1740,7 +1737,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 @@ -1749,7 +1746,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 @@ -1761,104 +1758,107 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm2 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3],xmm15[4],xmm1[5,6],xmm15[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3],xmm15[4],xmm0[5,6],xmm15[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm0 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm8[1],xmm14[2,3],xmm8[4],xmm14[5,6],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm4[2],xmm8[3,4],xmm4[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm14[2],xmm4[3,4],xmm14[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm10[2],xmm14[3,4],xmm10[5],xmm14[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5,6],xmm14[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3],xmm11[4],xmm13[5,6],xmm11[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm15[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm11[2],xmm5[3,4],xmm11[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1],xmm10[2],xmm6[3,4],xmm10[5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm9[1],xmm5[2,3],xmm9[4],xmm5[5,6],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm13[2],xmm9[3,4],xmm13[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3,4],xmm7[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 48(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 80(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 288(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 368(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 80(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 288(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 368(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm14, 320(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 336(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 96(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 336(%rcx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1867,8 +1867,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rcx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 240(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1889,30 +1888,30 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rcx) -; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 +; AVX1-ONLY-NEXT: addq $280, %rsp # imm = 0x118 ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride3_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 80(%rsi), %xmm5 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <5,5,u,6,6,u,7,7> ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm8, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm6 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] @@ -1923,8 +1922,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm8, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 112(%rsi), %xmm10 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] @@ -1935,8 +1934,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm10[2],xmm6[3,4],xmm10[5],xmm6[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm6, %ymm6 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm8, %ymm10 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm10, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm10, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12 @@ -1948,7 +1947,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm11, %ymm7 -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,2,2] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[3,3,3,3,4,5,6,7] @@ -1959,7 +1958,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm12, %ymm9 +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm12, %ymm9 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 @@ -1977,58 +1976,58 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm9, %ymm10, %ymm9 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm14[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm10[2],xmm4[3,4],xmm10[5],xmm4[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2],xmm2[3,4],xmm10[5],xmm2[6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2 ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm12, %ymm10 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm4, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm4 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm15 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm14[2],xmm5[3,4],xmm14[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm11 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm14 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm12, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm5, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = ; AVX2-SLOW-NEXT: vpermd (%rdi), %ymm12, %ymm13 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm13, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm13 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm13 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm13, %ymm13 ; AVX2-SLOW-NEXT: vpermd 64(%rdi), %ymm12, %ymm15 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm15, %ymm13 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm15 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm15, %ymm15 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm15 ; AVX2-SLOW-NEXT: vpermd 32(%rdi), %ymm12, %ymm6 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm15, %ymm6, %ymm6 ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm15 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm15, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm2 ; AVX2-SLOW-NEXT: vpermd 96(%rdi), %ymm12, %ymm12 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm12, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm12, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,3,3,u,4,4,u> -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm12, %ymm2 +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm12, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm3 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm12, %ymm0 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm12, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 320(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 224(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 288(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 288(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm10, 96(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm9, 192(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx) @@ -2044,137 +2043,138 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i16_stride3_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm4 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm4 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 80(%rsi), %xmm6 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vmovdqa 80(%rsi), %xmm5 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm7 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm7 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2],xmm15[3,4],xmm7[5],xmm15[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm6, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm14 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm14, %xmm14 +; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm8 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3,4],xmm14[5],xmm7[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm14 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm15 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm12, %ymm15 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm7, %ymm15, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm7, %ymm15, %ymm7 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm14, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm14 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm13, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm8 ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm3 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm13, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm12 ; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 112(%rsi), %xmm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa 112(%rsi), %xmm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3,4],xmm4[5],xmm0[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm3 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpermd (%rdi), %ymm5, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpermd 64(%rdi), %ymm5, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm6, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm5, %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm12 -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm13 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm3 -; AVX2-FAST-NEXT: vpermd 96(%rdi), %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,u,3,3,u,4,4,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm12, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm3, 320(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 128(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 224(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 288(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 352(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm14, 96(%rcx) +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm13 +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpermd (%rdi), %ymm4, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpermd 64(%rdi), %ymm4, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm4, %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm11, %ymm14, %ymm11 +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm14 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vpermd 96(%rdi), %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u> +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm11, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, 320(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 128(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 224(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 64(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm12, 288(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm13, 352(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 96(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm7, 160(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rcx) @@ -2187,137 +2187,138 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride3_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm13, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rsi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2],xmm15[3,4],xmm7[5],xmm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm10, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm13, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3,4],xmm14[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm15, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm12, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm15, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm10, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm15, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm13, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm13, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rsi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3,4],xmm4[5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm12, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm4, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm12, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm5, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermd 64(%rdi), %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm12, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm5, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm13, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermd 96(%rdi), %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <2,u,3,3,u,4,4,u> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 320(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 288(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 352(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm10, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm4, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermd 64(%rdi), %ymm4, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm4, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm11, %ymm14, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd 96(%rdi), %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm9, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm11, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 320(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 288(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 352(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 96(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 160(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll index bc25bb39f9691..6cd1f13398c60 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll @@ -1355,12 +1355,13 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa 48(%rsi), %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] ; SSE-NEXT: movdqa %xmm13, %xmm0 @@ -1369,98 +1370,97 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdx), %xmm0 -; SSE-NEXT: movdqa 64(%rcx), %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa 64(%rdi), %xmm10 -; SSE-NEXT: movdqa 64(%rsi), %xmm5 -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa 64(%rcx), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa 64(%rdi), %xmm13 +; SSE-NEXT: movdqa 64(%rsi), %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] ; SSE-NEXT: movdqa 80(%rdx), %xmm0 -; SSE-NEXT: movdqa 80(%rcx), %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movdqa 80(%rdi), %xmm5 +; SSE-NEXT: movdqa 80(%rcx), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa 80(%rdi), %xmm11 ; SSE-NEXT: movdqa 80(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; SSE-NEXT: movdqa %xmm11, %xmm15 +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: movdqa 96(%rdx), %xmm1 -; SSE-NEXT: movdqa 96(%rcx), %xmm4 +; SSE-NEXT: movdqa 96(%rcx), %xmm6 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa 96(%rdi), %xmm3 -; SSE-NEXT: movdqa 96(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa 96(%rsi), %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; SSE-NEXT: movdqa 112(%rdx), %xmm2 -; SSE-NEXT: movdqa 112(%rcx), %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE-NEXT: movdqa 112(%rcx), %xmm7 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; SSE-NEXT: movdqa 112(%rdi), %xmm0 -; SSE-NEXT: movdqa 112(%rsi), %xmm8 +; SSE-NEXT: movdqa 112(%rsi), %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, 496(%r8) -; SSE-NEXT: movdqa %xmm7, 480(%r8) +; SSE-NEXT: movdqa %xmm4, 480(%r8) ; SSE-NEXT: movdqa %xmm1, 464(%r8) -; SSE-NEXT: movdqa %xmm4, 448(%r8) -; SSE-NEXT: movdqa %xmm3, 432(%r8) -; SSE-NEXT: movdqa %xmm11, 416(%r8) -; SSE-NEXT: movdqa %xmm6, 400(%r8) -; SSE-NEXT: movdqa %xmm13, 384(%r8) -; SSE-NEXT: movdqa %xmm5, 368(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 352(%r8) -; SSE-NEXT: movdqa %xmm15, 336(%r8) +; SSE-NEXT: movdqa %xmm3, 448(%r8) +; SSE-NEXT: movdqa %xmm5, 432(%r8) +; SSE-NEXT: movdqa %xmm10, 416(%r8) +; SSE-NEXT: movdqa %xmm9, 400(%r8) +; SSE-NEXT: movdqa %xmm12, 384(%r8) +; SSE-NEXT: movdqa %xmm11, 368(%r8) +; SSE-NEXT: movdqa %xmm15, 352(%r8) +; SSE-NEXT: movdqa %xmm8, 336(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%r8) -; SSE-NEXT: movdqa %xmm10, 304(%r8) +; SSE-NEXT: movdqa %xmm13, 304(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 288(%r8) -; SSE-NEXT: movdqa %xmm12, 272(%r8) +; SSE-NEXT: movdqa %xmm14, 272(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r8) -; SSE-NEXT: movdqa %xmm14, 208(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll index 92acf21cad010..2670cc353aa17 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -174,20 +174,21 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[3,3,3,3] -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7] ; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 ; SSE-NEXT: por %xmm4, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,65535,0,65535] ; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] -; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] +; SSE-NEXT: pandn %xmm6, %xmm4 ; SSE-NEXT: por %xmm7, %xmm4 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] @@ -200,14 +201,14 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm2, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 ; SSE-NEXT: por %xmm7, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: pand %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: movq %xmm3, 32(%r9) ; SSE-NEXT: movdqa %xmm2, (%r9) ; SSE-NEXT: movdqa %xmm4, 16(%r9) @@ -410,18 +411,18 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride5_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm5 ; SSE-NEXT: movdqa (%rsi), %xmm7 ; SSE-NEXT: movdqa (%rdx), %xmm2 ; SSE-NEXT: movdqa (%rcx), %xmm3 ; SSE-NEXT: movdqa (%r8), %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,0,0,65535,65535] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,65535,0] @@ -432,18 +433,18 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm10, %xmm11 ; SSE-NEXT: por %xmm9, %xmm11 ; SSE-NEXT: pand %xmm8, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 ; SSE-NEXT: por %xmm11, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: por %xmm8, %xmm4 ; SSE-NEXT: movdqa %xmm2, %xmm8 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,1,3,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,1,1] @@ -454,8 +455,8 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1] ; SSE-NEXT: pandn %xmm8, %xmm0 ; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; SSE-NEXT: psrlq $48, %xmm7 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm7[1] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535] @@ -472,15 +473,15 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm12, %xmm1 ; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] -; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] +; SSE-NEXT: pand %xmm7, %xmm5 ; SSE-NEXT: pandn %xmm10, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] @@ -493,10 +494,10 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm8, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, 16(%r9) -; SSE-NEXT: movdqa %xmm4, 48(%r9) +; SSE-NEXT: movdqa %xmm5, 48(%r9) ; SSE-NEXT: movdqa %xmm1, 64(%r9) ; SSE-NEXT: movdqa %xmm0, (%r9) -; SSE-NEXT: movdqa %xmm5, 32(%r9) +; SSE-NEXT: movdqa %xmm4, 32(%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf8: @@ -799,161 +800,157 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm15 ; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa (%rsi), %xmm15 -; SSE-NEXT: movdqa 16(%rsi), %xmm13 +; SSE-NEXT: movdqa (%rsi), %xmm8 +; SSE-NEXT: movdqa 16(%rsi), %xmm0 ; SSE-NEXT: movdqa 16(%rdx), %xmm10 ; SSE-NEXT: movdqa (%rcx), %xmm14 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rcx), %xmm11 -; SSE-NEXT: movdqa 16(%r8), %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,2,2] +; SSE-NEXT: movdqa 16(%r8), %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,65535,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: por %xmm6, %xmm12 -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por %xmm4, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: por %xmm12, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm15[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: por %xmm7, %xmm12 +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: pandn %xmm7, %xmm13 +; SSE-NEXT: por %xmm4, %xmm13 +; SSE-NEXT: pand %xmm12, %xmm13 +; SSE-NEXT: por %xmm6, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm2, %xmm13 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: por %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm15, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: por %xmm7, %xmm13 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm9, %xmm14 ; SSE-NEXT: pandn %xmm7, %xmm14 -; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[1,1,2,2] +; SSE-NEXT: movdqa (%rdx), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,2,2] ; SSE-NEXT: pand %xmm9, %xmm15 ; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: movdqa (%r8), %xmm2 +; SSE-NEXT: pand %xmm12, %xmm14 +; SSE-NEXT: pandn %xmm13, %xmm12 +; SSE-NEXT: movdqa (%r8), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm14, %xmm12 +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm12, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: por %xmm14, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] -; SSE-NEXT: psrlq $48, %xmm13 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm13[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,1,3,3] -; SSE-NEXT: pand %xmm2, %xmm15 -; SSE-NEXT: por %xmm13, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: pandn %xmm12, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm14[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: por %xmm15, %xmm12 +; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm12, %xmm14 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,xmm15[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,2] +; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: pand %xmm15, %xmm12 -; SSE-NEXT: por %xmm12, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: pand %xmm15, %xmm13 +; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: por %xmm2, %xmm12 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,0,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,1] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm11, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; SSE-NEXT: psrlq $48, %xmm11 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm11[1] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; SSE-NEXT: psrlq $48, %xmm8 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,5,7,6] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,7,6] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm11, %xmm8 +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm2, %xmm8 ; SSE-NEXT: pand %xmm9, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,3,2,3] -; SSE-NEXT: pandn %xmm11, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; SSE-NEXT: pandn %xmm2, %xmm9 ; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm3, %xmm1 @@ -962,10 +959,10 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: por %xmm1, %xmm13 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] @@ -979,11 +976,11 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: por %xmm10, %xmm1 ; SSE-NEXT: movdqa %xmm1, (%r9) -; SSE-NEXT: movdqa %xmm12, 16(%r9) +; SSE-NEXT: movdqa %xmm13, 16(%r9) ; SSE-NEXT: movdqa %xmm15, 48(%r9) ; SSE-NEXT: movdqa %xmm9, 64(%r9) ; SSE-NEXT: movdqa %xmm7, 80(%r9) -; SSE-NEXT: movdqa %xmm13, 96(%r9) +; SSE-NEXT: movdqa %xmm12, 96(%r9) ; SSE-NEXT: movdqa %xmm14, 128(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) @@ -995,45 +992,45 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm10[1],xmm3[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm8[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3,4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1,2,3,4],xmm8[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm8 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm2[1],xmm8[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1,2,3,4],xmm7[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm7 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm3[1],xmm7[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm14 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm14 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm7[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm0 @@ -1043,79 +1040,79 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm0 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1,2,3],xmm8[4],xmm12[5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm10[0,1],xmm13[2],xmm10[3,4,5,6],xmm13[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm8[0,1],xmm13[2],xmm8[3,4,5,6],xmm13[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm15, %ymm9 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm14[1],xmm0[2,3,4,5],xmm14[6],xmm0[7] +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm14[1],xmm0[2,3,4,5],xmm14[6],xmm0[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm11[3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm11[3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm11 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm15, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1],xmm11[2,3,4,5],xmm13[6],xmm11[7] +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm12 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm3[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm2[2],xmm6[3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm12[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4],xmm2[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 48(%r9) +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm3[2],xmm6[3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2],xmm11[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4],xmm3[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 112(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 96(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 16(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 112(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 96(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm4, 64(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 80(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 128(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 80(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 128(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) ; AVX1-ONLY-NEXT: vzeroupper @@ -1562,147 +1559,151 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride5_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $232, %rsp -; SSE-NEXT: movdqa (%rdi), %xmm6 -; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa 16(%rsi), %xmm13 -; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa (%rcx), %xmm0 -; SSE-NEXT: movdqa 16(%rcx), %xmm12 +; SSE-NEXT: subq $248, %rsp +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm6 +; SSE-NEXT: movdqa 16(%rsi), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r8), %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa (%rcx), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rcx), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r8), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm14, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: por %xmm5, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm5, %xmm10 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm15, %xmm3 -; SSE-NEXT: por %xmm10, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pandn %xmm8, %xmm11 +; SSE-NEXT: por %xmm7, %xmm11 +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: por %xmm4, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm11, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm11, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm9, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: pand %xmm15, %xmm7 ; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm7 ; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: movdqa 16(%rdx), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,1,2,2] -; SSE-NEXT: pand %xmm14, %xmm11 +; SSE-NEXT: movdqa 16(%rdx), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,2,2] +; SSE-NEXT: pand %xmm13, %xmm11 ; SSE-NEXT: por %xmm11, %xmm7 ; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: por %xmm10, %xmm7 -; SSE-NEXT: movdqa 16(%r8), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm10, %xmm3 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: movdqa 16(%r8), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 ; SSE-NEXT: por %xmm7, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm7 ; SSE-NEXT: pandn %xmm3, %xmm7 ; SSE-NEXT: movdqa 32(%rsi), %xmm3 -; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm10 -; SSE-NEXT: por %xmm7, %xmm10 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 ; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm8, %xmm7 ; SSE-NEXT: movdqa 32(%rcx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: pandn %xmm10, %xmm11 -; SSE-NEXT: movdqa 32(%rdx), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,2,2] -; SSE-NEXT: pand %xmm14, %xmm12 -; SSE-NEXT: por %xmm12, %xmm11 -; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: por %xmm7, %xmm11 -; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: pandn %xmm8, %xmm12 +; SSE-NEXT: movdqa 32(%rdx), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,2,2] +; SSE-NEXT: pand %xmm13, %xmm8 +; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: pand %xmm1, %xmm12 +; SSE-NEXT: por %xmm7, %xmm12 +; SSE-NEXT: pand %xmm10, %xmm12 ; SSE-NEXT: movdqa 32(%r8), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: pandn %xmm7, %xmm3 -; SSE-NEXT: por %xmm11, %xmm3 +; SSE-NEXT: por %xmm12, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm7 ; SSE-NEXT: pandn %xmm3, %xmm7 ; SSE-NEXT: movdqa 48(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm11 -; SSE-NEXT: por %xmm7, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 ; SSE-NEXT: movdqa 48(%rcx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: movdqa %xmm13, %xmm12 ; SSE-NEXT: pandn %xmm7, %xmm12 ; SSE-NEXT: movdqa 48(%rdx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,2,2] -; SSE-NEXT: pand %xmm14, %xmm7 +; SSE-NEXT: pand %xmm13, %xmm7 ; SSE-NEXT: por %xmm7, %xmm12 ; SSE-NEXT: pand %xmm1, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 ; SSE-NEXT: por %xmm12, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: movdqa 48(%r8), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: pandn %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[0,1,3,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] ; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: pand %xmm9, %xmm12 -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm15[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: pand %xmm15, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,1,0,1] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 ; SSE-NEXT: por %xmm12, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] @@ -1713,244 +1714,244 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,4,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm12 ; SSE-NEXT: por %xmm12, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pandn %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,1,2,2] -; SSE-NEXT: pand %xmm1, %xmm12 -; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm15, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: por %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm10, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,1] -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,1] +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: pandn %xmm5, %xmm12 +; SSE-NEXT: por %xmm6, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm13 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm13[1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[2,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm13 -; SSE-NEXT: por %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm2[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,1,1] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,1,0,1] -; SSE-NEXT: pandn %xmm6, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: movdqa (%rsp), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,0,1] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,3,3] +; SSE-NEXT: pand %xmm0, %xmm11 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: pand %xmm13, %xmm11 +; SSE-NEXT: por %xmm11, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: pand %xmm14, %xmm10 -; SSE-NEXT: por %xmm10, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm10[0,1,3,2,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm11[0,1,3,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] ; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm12, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,1] -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: por %xmm3, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: por %xmm3, %xmm15 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm3, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: por %xmm11, %xmm1 ; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] -; SSE-NEXT: pandn %xmm3, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 ; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm14, 304(%r9) -; SSE-NEXT: movdqa %xmm15, 288(%r9) -; SSE-NEXT: movdqa %xmm2, 256(%r9) +; SSE-NEXT: punpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm5[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm13, 304(%r9) +; SSE-NEXT: movdqa %xmm10, 288(%r9) +; SSE-NEXT: movdqa %xmm1, 256(%r9) +; SSE-NEXT: movdqa %xmm15, 240(%r9) +; SSE-NEXT: movdqa %xmm14, 224(%r9) +; SSE-NEXT: movdqa %xmm9, 208(%r9) +; SSE-NEXT: movdqa %xmm8, 176(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 240(%r9) -; SSE-NEXT: movdqa %xmm9, 224(%r9) -; SSE-NEXT: movdqa %xmm7, 208(%r9) -; SSE-NEXT: movdqa %xmm11, 176(%r9) -; SSE-NEXT: movdqa %xmm13, 160(%r9) +; SSE-NEXT: movaps %xmm0, 160(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1975,134 +1976,134 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 112(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) -; SSE-NEXT: addq $232, %rsp +; SSE-NEXT: addq $248, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $72, %rsp +; AVX1-ONLY-NEXT: subq $56, %rsp ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm6 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm4[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,2,2] ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm12, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm14, %ymm7 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm8 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm5[1],xmm8[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm4[1],xmm8[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm6 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2,3,4],xmm7[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1,2,3,4],xmm7[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[1,1,2,2] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1],xmm6[2],xmm4[3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm15[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm8, %ymm6 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm15[1],xmm7[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm15[1],xmm6[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm1[1],xmm5[2,3,4,5],xmm1[6],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4],xmm2[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4],xmm3[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm7[3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2],xmm0[3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] @@ -2111,22 +2112,21 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm1[1],xmm4[1] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm1[1],xmm3[1] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm8 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm7[2],xmm4[3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1,2,3,4],xmm7[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm2[1,2,3,4],xmm7[5],xmm2[6,7] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm15 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm2 @@ -2135,18 +2135,18 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm11, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm14, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] @@ -2155,46 +2155,48 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6],xmm1[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm14, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm6 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm9[4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm12, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm8 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2,3,4,5],xmm12[6],xmm9[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm12[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7] @@ -2208,21 +2210,20 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1],xmm5[2],xmm12[3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm9[1,2,3,4],xmm5[5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1,2,3,4],xmm5[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 16(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 16(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 96(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 112(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 64(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 80(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 112(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 64(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 80(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 128(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2241,32 +2242,32 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%r9) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r9) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%r9) -; AVX1-ONLY-NEXT: addq $72, %rsp +; AVX1-ONLY-NEXT: addq $56, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride5_vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $72, %rsp -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] @@ -2274,146 +2275,148 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm11 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm11 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm5, %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3],xmm12[4],xmm0[5],xmm12[6],xmm0[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0],xmm6[1],xmm10[2],xmm6[3],xmm10[4,5],xmm6[6],xmm10[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm12, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,1,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,1,1,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm10, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8],ymm10[9],ymm15[10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm10 -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10],ymm0[11],ymm7[12,13],ymm0[14],ymm7[15] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm5 +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX2-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5],xmm11[6],xmm9[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11 -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm14, %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm10[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm11[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2],ymm9[3,4],ymm14[5,6,7,8],ymm9[9],ymm14[10],ymm9[11,12],ymm14[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm15, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm4[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm14, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm10[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm14 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2],ymm4[3,4],ymm15[5,6,7,8],ymm4[9],ymm15[10],ymm4[11,12],ymm15[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm12, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm4, %ymm9 -; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm12 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[1,1,1,2,5,5,5,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5],ymm15[6],ymm4[7,8],ymm15[9],ymm4[10,11],ymm15[12],ymm4[13],ymm15[14],ymm4[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm3[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3],ymm5[4],ymm15[5,6],ymm5[7],ymm15[8,9],ymm5[10],ymm15[11],ymm5[12],ymm15[13,14],ymm5[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3,4],ymm14[5,6,7,8],ymm13[9],ymm14[10],ymm13[11,12],ymm14[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm13, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm6[0,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm13, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm5[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm13 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3,4],ymm0[5,6,7,8],ymm5[9],ymm0[10],ymm5[11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm5, %ymm12 +; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm14 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[1,1,1,2,5,5,5,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10,11],ymm5[12],ymm1[13],ymm5[14],ymm1[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm10[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4,5],ymm6[6],ymm11[7,8],ymm6[9],ymm11[10],ymm6[11],ymm11[12,13],ymm6[14],ymm11[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10,11],ymm5[12],ymm0[13],ymm5[14],ymm0[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm11, %ymm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm14, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm10, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm9, %ymm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm8[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r9) +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 224(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm12, 128(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 288(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 256(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm14, 128(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 288(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 256(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload @@ -2426,158 +2429,155 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i16_stride5_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $72, %rsp -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-NEXT: subq $40, %rsp +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm5 -; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm6 +; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm13 ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm12 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2],xmm14[3],xmm12[4,5],xmm14[6],xmm12[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm12, %ymm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,1,1,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm12, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm14 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,0,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,1,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5],xmm13[6],xmm6[7] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5],xmm13[6],xmm7[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm10, %xmm7 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5],xmm14[6],xmm7[7] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm10 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2],xmm14[3],xmm10[4,5],xmm14[6],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm9, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm6, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm8, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8],ymm11[9],ymm1[10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm11 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3,4],ymm11[5,6,7,8],ymm13[9],ymm11[10],ymm13[11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm11 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm8 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8],ymm15[9],ymm0[10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm15 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm12 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3,4],ymm12[5,6,7,8],ymm5[9],ymm12[10],ymm5[11,12],ymm12[13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7,8],ymm14[9],ymm0[10],ymm14[11],ymm0[12,13],ymm14[14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm14 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm12 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3,4],ymm12[5,6,7,8],ymm6[9],ymm12[10],ymm6[11,12],ymm12[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm5, %ymm12 -; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm13 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5],ymm14[6],ymm5[7,8],ymm14[9],ymm5[10,11],ymm14[12],ymm5[13],ymm14[14],ymm5[15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3],ymm6[4],ymm0[5,6],ymm6[7],ymm0[8,9],ymm6[10],ymm0[11],ymm6[12],ymm0[13,14],ymm6[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm6, %ymm12 +; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5],ymm6[6],ymm1[7,8],ymm6[9],ymm1[10,11],ymm6[12],ymm1[13],ymm6[14],ymm1[15] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3],ymm7[4],ymm0[5,6],ymm7[7],ymm0[8,9],ymm7[10],ymm0[11],ymm7[12],ymm0[13,14],ymm7[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3],ymm14[4],ymm5[5,6],ymm14[7],ymm5[8,9],ymm14[10],ymm5[11],ymm14[12],ymm5[13,14],ymm14[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3],ymm6[4],ymm2[5,6],ymm6[7],ymm2[8,9],ymm6[10],ymm2[11],ymm6[12],ymm2[13,14],ymm6[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7,8],ymm6[9],ymm9[10],ymm6[11],ymm9[12,13],ymm6[14],ymm9[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm7[1,1,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm6 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm8[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8],ymm2[9],ymm6[10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpermq $165, (%rsp), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%r9) @@ -2586,172 +2586,168 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa %ymm13, 128(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm12, 288(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 256(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 160(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-NEXT: addq $72, %rsp +; AVX2-FAST-NEXT: addq $40, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride5_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $72, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: subq $40, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2],xmm14[3],xmm12[4,5],xmm14[6],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm5, %ymm12, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm13, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5],xmm13[6],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm11, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5],xmm13[6],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm10, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5],xmm14[6],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm11, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2],xmm14[3],xmm10[4,5],xmm14[6],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm9, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm13, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm6, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm8, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm8, %ymm11, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 32(%r8), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8],ymm11[9],ymm1[10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm10, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm10, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3,4],ymm11[5,6,7,8],ymm13[9],ymm11[10],ymm13[11,12],ymm11[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8],ymm15[9],ymm0[10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm15, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3,4],ymm12[5,6,7,8],ymm5[9],ymm12[10],ymm5[11,12],ymm12[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7,8],ymm14[9],ymm0[10],ymm14[11],ymm0[12,13],ymm14[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm14, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3,4],ymm12[5,6,7,8],ymm6[9],ymm12[10],ymm6[11,12],ymm12[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5],ymm14[6],ymm5[7,8],ymm14[9],ymm5[10,11],ymm14[12],ymm5[13],ymm14[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3],ymm6[4],ymm0[5,6],ymm6[7],ymm0[8,9],ymm6[10],ymm0[11],ymm6[12],ymm0[13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 56(%r8), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm6, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5],ymm6[6],ymm1[7,8],ymm6[9],ymm1[10,11],ymm6[12],ymm1[13],ymm6[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3],ymm7[4],ymm0[5,6],ymm7[7],ymm0[8,9],ymm7[10],ymm0[11],ymm7[12],ymm0[13,14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm15, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm11, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3],ymm14[4],ymm5[5,6],ymm14[7],ymm5[8,9],ymm14[10],ymm5[11],ymm14[12],ymm5[13,14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 48(%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3],ymm6[4],ymm2[5,6],ymm6[7],ymm2[8,9],ymm6[10],ymm2[11],ymm6[12],ymm2[13,14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 48(%r8), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm10, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,0,3,0,7,4,7,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7,8],ymm6[9],ymm9[10],ymm6[11],ymm9[12,13],ymm6[14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm15, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,0,3,0,7,4,7,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm11, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm8[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8],ymm2[9],ymm6[10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm11, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, (%rsp), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%r9) @@ -2760,15 +2756,14 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 128(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 288(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 256(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 160(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: addq $72, %rsp +; AVX2-FAST-PERLANE-NEXT: addq $40, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2940,63 +2935,63 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm12[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm2 -; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm3 +; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm16 ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm20 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2],xmm5[3],xmm4[4,5],xmm5[6],xmm4[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,0] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm4 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm22 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpandn %ymm1, %ymm10, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpandn %ymm2, %ymm7, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm17 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm12[0,1,2,1,4,5,6,5] ; AVX512F-FAST-NEXT: vprolq $16, %ymm11, %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm0 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] ; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm0 @@ -3004,80 +2999,80 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[1,1,1,2,5,5,5,6] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %ymm19 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm12 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm12[0],ymm13[1],ymm12[2],ymm13[3,4],ymm12[5,6,7,8],ymm13[9],ymm12[10],ymm13[11,12],ymm12[13,14,15] +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm12 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm13 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm19[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[3,2,3,3,7,6,7,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3,4],ymm2[5,6,7,8],ymm5[9],ymm2[10],ymm5[11,12],ymm2[13,14,15] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm9 -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm10 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm8 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm15 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm15[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm5[2,3,2,3,6,7,6,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8],ymm11[9],ymm1[10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 ; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm15[1,1,2,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm5[1,1,2,2] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5],ymm6[6],ymm11[7,8],ymm6[9],ymm11[10,11],ymm6[12],ymm11[13],ymm6[14],ymm11[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm12[2,3,2,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm15[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,2] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,2,3,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,2,1,4,5,6,5] ; AVX512F-FAST-NEXT: vprolq $16, %ymm8, %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3],ymm14[4],ymm8[5,6],ymm14[7],ymm8[8,9],ymm14[10],ymm8[11],ymm14[12],ymm8[13,14],ymm14[15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm5[2],ymm8[3],ymm5[4],ymm8[5,6],ymm5[7],ymm8[8,9],ymm5[10],ymm8[11],ymm5[12],ymm8[13,14],ymm5[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm6 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm13, %ymm8 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5],ymm12[6],ymm8[7,8],ymm12[9],ymm8[10,11],ymm12[12],ymm8[13],ymm12[14],ymm8[15] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm13, %ymm8 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm8[1],ymm14[2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8],ymm8[9],ymm14[10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13],ymm14[14],ymm13[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm8 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm8[0,1,1,1] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8 +; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm13 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpandn %ymm13, %ymm15, %ymm13 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm13 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm13[0,1,1,1] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpandnq %ymm15, %ymm19, %ymm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm13 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm3 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm17[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm10, %zmm9 -; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm7, %zmm10, %zmm0 -; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm7 -; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm19 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm18 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm11, %zmm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm17[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm9, %zmm13, %zmm11 +; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm13, %zmm0 +; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm3 +; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] ; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1 @@ -3085,12 +3080,12 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm4 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm6, %zmm2, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 64(%r9) +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm2, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 64(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 128(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 256(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 256(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, (%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 192(%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -3169,36 +3164,35 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i16_stride5_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $616, %rsp # imm = 0x268 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm14 ; SSE-NEXT: movdqa 16(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm15 +; SSE-NEXT: movdqa (%rsi), %xmm12 ; SSE-NEXT: movdqa 16(%rsi), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdx), %xmm7 -; SSE-NEXT: movdqa (%rcx), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rcx), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r8), %xmm13 +; SSE-NEXT: movdqa (%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rcx), %xmm13 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa (%r8), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm10, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: por %xmm3, %xmm8 ; SSE-NEXT: pand %xmm1, %xmm8 @@ -3206,24 +3200,24 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm2, %xmm8 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 ; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm4, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: pand %xmm9, %xmm5 ; SSE-NEXT: por %xmm3, %xmm5 ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm12[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm10, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa 16(%rdx), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: movdqa 16(%rdx), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,2,2] +; SSE-NEXT: pand %xmm10, %xmm8 ; SSE-NEXT: por %xmm8, %xmm6 ; SSE-NEXT: pand %xmm1, %xmm6 ; SSE-NEXT: por %xmm3, %xmm6 @@ -3236,13 +3230,13 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa 32(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: pand %xmm9, %xmm6 ; SSE-NEXT: por %xmm3, %xmm6 ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pandn %xmm6, %xmm3 @@ -3250,11 +3244,11 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm10, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: movdqa 32(%rdx), %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: movdqa 32(%rdx), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,2,2] +; SSE-NEXT: pand %xmm10, %xmm11 ; SSE-NEXT: por %xmm11, %xmm8 ; SSE-NEXT: pand %xmm1, %xmm8 ; SSE-NEXT: por %xmm3, %xmm8 @@ -3267,13 +3261,13 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa 48(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm8 ; SSE-NEXT: por %xmm3, %xmm8 ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pandn %xmm8, %xmm3 @@ -3281,12 +3275,12 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm10, %xmm11 ; SSE-NEXT: pandn %xmm8, %xmm11 ; SSE-NEXT: movdqa 48(%rdx), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[1,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm13 -; SSE-NEXT: por %xmm13, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm8[1,1,2,2] +; SSE-NEXT: pand %xmm10, %xmm15 +; SSE-NEXT: por %xmm15, %xmm11 ; SSE-NEXT: pand %xmm1, %xmm11 ; SSE-NEXT: por %xmm3, %xmm11 ; SSE-NEXT: pand %xmm2, %xmm11 @@ -3298,29 +3292,29 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa 64(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: pand %xmm9, %xmm11 ; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm13 +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: pandn %xmm11, %xmm15 ; SSE-NEXT: movdqa 64(%rcx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm10, %xmm11 ; SSE-NEXT: pandn %xmm3, %xmm11 ; SSE-NEXT: movdqa 64(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: por %xmm13, %xmm11 +; SSE-NEXT: por %xmm15, %xmm11 ; SSE-NEXT: pand %xmm2, %xmm11 ; SSE-NEXT: movdqa 64(%r8), %xmm3 ; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill @@ -3330,13 +3324,13 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 ; SSE-NEXT: movdqa 80(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: pand %xmm9, %xmm11 ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pandn %xmm11, %xmm0 @@ -3344,31 +3338,31 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm13 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm11, %xmm15 ; SSE-NEXT: movdqa 80(%rdx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm11 -; SSE-NEXT: por %xmm11, %xmm13 -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: pand %xmm2, %xmm13 +; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: por %xmm11, %xmm15 +; SSE-NEXT: pand %xmm1, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pand %xmm2, %xmm15 ; SSE-NEXT: movdqa 80(%r8), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: por %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: movdqa 96(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: pand %xmm9, %xmm11 ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pandn %xmm11, %xmm0 @@ -3376,46 +3370,46 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm13 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm11, %xmm15 ; SSE-NEXT: movdqa 96(%rdx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm11 -; SSE-NEXT: por %xmm11, %xmm13 -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: pand %xmm2, %xmm13 +; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: por %xmm11, %xmm15 +; SSE-NEXT: pand %xmm1, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pand %xmm2, %xmm15 ; SSE-NEXT: movdqa 96(%r8), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: por %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: movdqa 112(%rsi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm4[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: pand %xmm9, %xmm11 ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: movdqa 112(%rcx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 ; SSE-NEXT: movdqa 112(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pand %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: por %xmm13, %xmm1 +; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa 112(%r8), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3423,417 +3417,419 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,0,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm0[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,1,1] -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: por %xmm11, %xmm13 -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm0[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,1,1] +; SSE-NEXT: pand %xmm1, %xmm15 +; SSE-NEXT: por %xmm11, %xmm15 +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: pand %xmm9, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,1,0,1] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: por %xmm13, %xmm1 +; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,2,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] ; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm11 ; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm11, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm12 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm14 = xmm14[1],xmm12[1] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,0,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,1] ; SSE-NEXT: pand %xmm4, %xmm7 ; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,1] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: pand %xmm10, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: por %xmm7, %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm7 ; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] ; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm5 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm5 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] ; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm5 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,1,3,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,1,3,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 ; SSE-NEXT: por %xmm5, %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm5 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,5,7,6] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm5 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,1,3,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,1,3,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 ; SSE-NEXT: por %xmm5, %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm5 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] ; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,1] +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: movdqa %xmm14, %xmm7 ; SSE-NEXT: pandn %xmm3, %xmm7 ; SSE-NEXT: por %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,7,6] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm8 ; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: pand %xmm10, %xmm8 ; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -3842,67 +3838,67 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,1,3,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,1,1,1] -; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: pand %xmm4, %xmm14 ; SSE-NEXT: por %xmm3, %xmm14 -; SSE-NEXT: pand %xmm10, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: por %xmm14, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: por %xmm14, %xmm15 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] ; SSE-NEXT: movdqa %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,4,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: movdqa %xmm13, %xmm14 ; SSE-NEXT: pandn %xmm3, %xmm14 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE-NEXT: pandn %xmm1, %xmm11 ; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,5,7,6] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5,7,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm15 ; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm3, %xmm15 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm3, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -3911,81 +3907,80 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm3[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,1,1] +; SSE-NEXT: pand %xmm4, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm13, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm3[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm12, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm6, %xmm10 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; SSE-NEXT: pandn %xmm3, %xmm13 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm5[1] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm6[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,5,7,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: por %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm9, 624(%r9) -; SSE-NEXT: movdqa %xmm13, 608(%r9) -; SSE-NEXT: movdqa %xmm0, 576(%r9) -; SSE-NEXT: movdqa %xmm10, 560(%r9) -; SSE-NEXT: movdqa %xmm15, 544(%r9) +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: pand %xmm10, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: por %xmm9, %xmm10 +; SSE-NEXT: movdqa %xmm10, 624(%r9) +; SSE-NEXT: movdqa %xmm0, 608(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 576(%r9) +; SSE-NEXT: movdqa %xmm2, 560(%r9) +; SSE-NEXT: movdqa %xmm12, 544(%r9) ; SSE-NEXT: movdqa %xmm11, 528(%r9) ; SSE-NEXT: movdqa %xmm14, 496(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 480(%r9) +; SSE-NEXT: movdqa %xmm15, 480(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 464(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 448(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 400(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 336(%r9) @@ -4045,46 +4040,46 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-LABEL: store_i16_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $392, %rsp # imm = 0x188 -; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm10[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm8[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 80(%r8), %xmm9 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm0[1,2,3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4],xmm1[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 @@ -4092,79 +4087,79 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandps %ymm12, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm13 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4],xmm11[5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm8[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4,5,6],xmm11[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0,1],xmm11[2],xmm10[3,4,5,6],xmm11[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm14[1],xmm2[2,3,4,5],xmm14[6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm9[3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm14[1],xmm4[2,3,4,5],xmm14[6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm9[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,1,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm8 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2,3],xmm4[4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm4 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1,2,3],xmm8[4],xmm6[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm4[2],xmm7[3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm5[2],xmm7[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm4[3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3,4],xmm4[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3,4],xmm5[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm5 @@ -4174,21 +4169,21 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] @@ -4202,8 +4197,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 @@ -4211,63 +4206,63 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm10[4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm9[4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm11, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm9, %ymm13, %ymm9 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm13 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm10[1],xmm13[1] +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm10, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm13[1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm11[3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1,2,3,4],xmm7[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1,2,3,4],xmm8[5],xmm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm7[2],xmm0[3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm8[2],xmm0[3,4,5,6],xmm8[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] @@ -4276,7 +4271,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] @@ -4292,21 +4287,21 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] @@ -4320,72 +4315,72 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm4[1,2,3,4],xmm6[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm10[4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm11[4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm11, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm10, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm9, %ymm13, %ymm9 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm13 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm10[1],xmm13[1] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm13 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm11[1],xmm13[1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 96(%r8), %xmm2 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4],xmm5[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1,2,3,4],xmm7[5],xmm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm5[2],xmm0[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm7[2],xmm0[3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] @@ -4402,9 +4397,9 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] @@ -4420,8 +4415,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 @@ -4434,30 +4429,30 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm11[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 @@ -4465,56 +4460,56 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm6[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1,2,3],xmm7[4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3,4,5],xmm7[6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm10[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm10[4],xmm5[5,6,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1],xmm6[2],xmm12[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2,3,4],xmm6[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%r9) +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1],xmm5[2],xmm12[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 32(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 112(%r9) @@ -4541,9 +4536,9 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm0, 624(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 608(%r9) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 592(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 592(%r9) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 576(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) @@ -4592,346 +4587,345 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-LABEL: store_i16_stride5_vf64: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $968, %rsp # imm = 0x3C8 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm6, %ymm1, %ymm9 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm6, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm1, %ymm14 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm7, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm6 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm1, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm7 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm5, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm5, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm9, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 64(%r8), %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6],xmm1[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm9 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastq 72(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm14, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 64(%r8), %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm15, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm4 +; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6],xmm5[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm4 +; AVX2-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm4 +; AVX2-SLOW-NEXT: vpbroadcastq 72(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm10 ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2],xmm5[3],xmm4[4,5],xmm5[6],xmm4[7] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm11 -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastq 104(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm8 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpbroadcastq 104(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm8 ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm9, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm6[0,1,1,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm8[0,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm6[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm7[0,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm11[0,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,1,2,5,5,5,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm7 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm10 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[1,1,1,2,5,5,5,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm0[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[1,1,1,2,5,5,5,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[1,1,1,2,5,5,5,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm0 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3],ymm14[4],ymm0[5,6],ymm14[7],ymm0[8,9],ymm14[10],ymm0[11],ymm14[12],ymm0[13,14],ymm14[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm8, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 80(%r8), %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm9, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 80(%r8), %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 112(%r8), %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 112(%r8), %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8],ymm3[9],ymm0[10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3,4],ymm3[5,6,7,8],ymm8[9],ymm3[10],ymm8[11,12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3,4],ymm1[5,6,7,8],ymm5[9],ymm1[10],ymm5[11,12],ymm1[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4,5],ymm12[6],ymm3[7,8],ymm12[9],ymm3[10],ymm12[11],ymm3[12,13],ymm12[14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2],ymm14[3,4],ymm12[5,6,7,8],ymm14[9],ymm12[10],ymm14[11,12],ymm12[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm12, %ymm3 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm15[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10],ymm14[11],ymm12[12,13],ymm14[14],ymm12[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3,4],ymm1[5,6,7,8],ymm5[9],ymm1[10],ymm5[11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10],ymm5[11],ymm1[12,13],ymm5[14],ymm1[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2],ymm15[3,4],ymm14[5,6,7,8],ymm15[9],ymm14[10],ymm15[11,12],ymm14[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm12, %ymm14, %ymm14 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10],ymm15[11],ymm12[12,13],ymm15[14],ymm12[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3,4],ymm5[5,6,7,8],ymm9[9],ymm5[10],ymm9[11,12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4,5],ymm9[6],ymm5[7,8],ymm9[9],ymm5[10],ymm9[11],ymm5[12,13],ymm9[14],ymm5[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3,4],ymm15[5,6,7,8],ymm0[9],ymm15[10],ymm0[11,12],ymm15[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm12, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm8, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 88(%r8), %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm8, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm10, %ymm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm6[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10],ymm15[11],ymm12[12,13],ymm15[14],ymm12[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm11[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5],ymm0[6],ymm8[7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13],ymm0[14],ymm8[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm12, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4,5],ymm12[6],ymm3[7,8],ymm12[9],ymm3[10],ymm12[11],ymm3[12,13],ymm12[14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm5, %ymm12 -; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10,11],ymm12[12],ymm10[13],ymm12[14],ymm10[15] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm10, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm7, %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2],ymm14[3,4],ymm9[5,6,7,8],ymm14[9],ymm9[10],ymm14[11,12],ymm9[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm5, %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 88(%r8), %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm9 +; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5],ymm9[6],ymm14[7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13],ymm9[14],ymm14[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm7, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2],ymm9[3],ymm15[4,5],ymm9[6],ymm15[7,8],ymm9[9],ymm15[10],ymm9[11],ymm15[12,13],ymm9[14],ymm15[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm12 +; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm11, %ymm9 +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm4, %ymm7 -; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm9, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm13[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm8 +; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm6[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm9, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm7, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 544(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 384(%r9) +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 544(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 384(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 224(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 608(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 576(%r9) @@ -4949,8 +4943,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm13, 128(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4970,344 +4963,337 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-LABEL: store_i16_stride5_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $936, %rsp # imm = 0x3A8 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm14 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm8 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm3[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm14 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm12[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm13 -; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm5 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm13 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm7 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm13, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm13, %ymm1 ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm15 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm9 +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm15 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm14, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 64(%r8), %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm15, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 96(%r8), %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm14, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 64(%r8), %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 96(%r8), %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm0 +; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX2-FAST-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = mem[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm3 +; AVX2-FAST-NEXT: vpbroadcastq 104(%rdi), %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5],xmm11[6],xmm5[7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm9, %ymm5 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm9 -; AVX2-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm8 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm8, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vpbroadcastq 104(%rdi), %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5],xmm8[6],xmm0[7] ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm4[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm7, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm12 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,1,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,1] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,1] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm11 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm10 +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm12[2],ymm3[3],ymm12[4],ymm3[5,6],ymm12[7],ymm3[8,9],ymm12[10],ymm3[11],ymm12[12],ymm3[13,14],ymm12[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm12 +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm13[1],ymm1[2,3],ymm13[4],ymm1[5],ymm13[6],ymm1[7,8],ymm13[9],ymm1[10,11],ymm13[12],ymm1[13],ymm13[14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3],ymm15[4],ymm0[5,6],ymm15[7],ymm0[8,9],ymm15[10],ymm0[11],ymm15[12],ymm0[13,14],ymm15[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm8, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 80(%r8), %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastq 80(%r8), %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm12, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 112(%r8), %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq 112(%r8), %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm10, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7,8],ymm7[9],ymm1[10],ymm7[11],ymm1[12,13],ymm7[14],ymm1[15] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm13 ; AVX2-FAST-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = mem[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3,4],ymm15[5,6,7,8],ymm14[9],ymm15[10],ymm14[11,12],ymm15[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm14 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3,4],ymm14[5,6,7,8],ymm11[9],ymm14[10],ymm11[11,12],ymm14[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm13 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm15 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2],ymm11[3,4],ymm15[5,6,7,8],ymm11[9],ymm15[10],ymm11[11,12],ymm15[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm12, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm12 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10],ymm14[11],ymm12[12,13],ymm14[14],ymm12[15] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm13 +; AVX2-FAST-NEXT: vpshufd $238, (%rsp), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm15 +; AVX2-FAST-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2],ymm9[3,4],ymm15[5,6,7,8],ymm9[9],ymm15[10],ymm9[11,12],ymm15[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10],ymm13[11],ymm0[12,13],ymm13[14],ymm0[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2],ymm10[3,4],ymm14[5,6,7,8],ymm10[9],ymm14[10],ymm10[11,12],ymm14[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm12, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7,8],ymm12[9],ymm0[10],ymm12[11],ymm0[12,13],ymm12[14],ymm0[15] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm4 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm6[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3,4],ymm7[5,6,7,8],ymm12[9],ymm7[10],ymm12[11,12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3,4],ymm12[5,6,7,8],ymm13[9],ymm12[10],ymm13[11,12],ymm12[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm11, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 88(%r8), %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 120(%r8), %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm7, %ymm11, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm10, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm11, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 88(%r8), %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm9, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 120(%r8), %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm11 ; AVX2-FAST-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = mem[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7,8],ymm11[9],ymm14[10],ymm11[11],ymm14[12,13],ymm11[14],ymm14[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3],ymm15[4],ymm0[5],ymm15[6],ymm0[7,8],ymm15[9],ymm0[10,11],ymm15[12],ymm0[13],ymm15[14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10,11],ymm15[12],ymm13[13],ymm15[14],ymm13[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm13, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm13 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm8[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5],ymm13[6],ymm10[7,8],ymm13[9],ymm10[10,11],ymm13[12],ymm10[13],ymm13[14],ymm10[15] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm12, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2],ymm12[3],ymm1[4,5],ymm12[6],ymm1[7,8],ymm12[9],ymm1[10],ymm12[11],ymm1[12,13],ymm12[14],ymm1[15] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm12 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10,11],ymm12[12],ymm10[13],ymm12[14],ymm10[15] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm10, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vpermq $165, (%rsp), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = mem[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm8, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm13[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm2, 544(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 384(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 224(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 608(%r9) +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm11, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm10, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 544(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 384(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 224(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 608(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5320,7 +5306,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 512(%r9) @@ -5345,344 +5331,337 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-LABEL: store_i16_stride5_vf64: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $936, %rsp # imm = 0x3A8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm3[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm12[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm15, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm13, %ymm15, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm13, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm15, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 32(%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm14, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 64(%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm15, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 96(%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 32(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm14, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 64(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 96(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 72(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 104(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5],xmm11[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm5, %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 72(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm8, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 104(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5],xmm8[6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm4[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm7, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm10, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm12[2],ymm3[3],ymm12[4],ymm3[5,6],ymm12[7],ymm3[8,9],ymm12[10],ymm3[11],ymm12[12],ymm3[13,14],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm13[1],ymm1[2,3],ymm13[4],ymm1[5],ymm13[6],ymm1[7,8],ymm13[9],ymm1[10,11],ymm13[12],ymm1[13],ymm13[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3],ymm15[4],ymm0[5,6],ymm15[7],ymm0[8,9],ymm15[10],ymm0[11],ymm15[12],ymm0[13,14],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 48(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm8, %ymm13, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 48(%r8), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 112(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 112(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm10, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7,8],ymm7[9],ymm1[10],ymm7[11],ymm1[12,13],ymm7[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm7 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3,4],ymm15[5,6,7,8],ymm14[9],ymm15[10],ymm14[11,12],ymm15[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3,4],ymm14[5,6,7,8],ymm11[9],ymm14[10],ymm11[11,12],ymm14[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm6, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm4, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2],ymm11[3,4],ymm15[5,6,7,8],ymm11[9],ymm15[10],ymm11[11,12],ymm15[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm12, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm10, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10],ymm14[11],ymm12[12,13],ymm14[14],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd $238, (%rsp), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm5, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2],ymm9[3,4],ymm15[5,6,7,8],ymm9[9],ymm15[10],ymm9[11,12],ymm15[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10],ymm13[11],ymm0[12,13],ymm13[14],ymm0[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2],ymm10[3,4],ymm14[5,6,7,8],ymm10[9],ymm14[10],ymm10[11,12],ymm14[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm12, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7,8],ymm12[9],ymm0[10],ymm12[11],ymm0[12,13],ymm12[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm6[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3,4],ymm7[5,6,7,8],ymm12[9],ymm7[10],ymm12[11,12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3,4],ymm12[5,6,7,8],ymm13[9],ymm12[10],ymm13[11,12],ymm12[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 56(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm11, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 88(%r8), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 120(%r8), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm7, %ymm11, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm12, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 56(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm11, %ymm10, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 88(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm9, %ymm10, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 120(%r8), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[3,0,3,0,7,4,7,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7,8],ymm11[9],ymm14[10],ymm11[11],ymm14[12,13],ymm11[14],ymm14[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3],ymm15[4],ymm0[5],ymm15[6],ymm0[7,8],ymm15[9],ymm0[10,11],ymm15[12],ymm0[13],ymm15[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10,11],ymm15[12],ymm13[13],ymm15[14],ymm13[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm12, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm13, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm6, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm8[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5],ymm13[6],ymm10[7,8],ymm13[9],ymm10[10,11],ymm13[12],ymm10[13],ymm13[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm12, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm5, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[3,0,3,0,7,4,7,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2],ymm12[3],ymm1[4,5],ymm12[6],ymm1[7,8],ymm12[9],ymm1[10],ymm12[11],ymm1[12,13],ymm12[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10,11],ymm12[12],ymm10[13],ymm12[14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[3,0,3,0,7,4,7,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, (%rsp), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm8, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm13[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 544(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 384(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 608(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm11, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm10, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 544(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 384(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 608(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5695,7 +5674,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 512(%r9) @@ -5719,673 +5698,675 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-SLOW-LABEL: store_i16_stride5_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: subq $616, %rsp # imm = 0x268 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm13 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa64 96(%rdx), %ymm26 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %ymm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,0,3,0,7,4,7,4] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm3 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm7, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 96(%rdi), %ymm21 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm21[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm4 -; AVX512F-SLOW-NEXT: vpbroadcastq 104(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm16 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm8[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm4 +; AVX512F-SLOW-NEXT: vpbroadcastq 104(%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3],xmm9[4],xmm4[5],xmm9[6],xmm4[7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm1[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpandn %ymm5, %ymm9, %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm27 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm31 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm31[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4],ymm5[5,6,7,8],ymm4[9],ymm5[10],ymm4[11,12],ymm5[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,2,3,2] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpandn %ymm3, %ymm5, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm24 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3,4],ymm4[5,6,7,8],ymm3[9],ymm4[10],ymm3[11,12],ymm4[13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm28 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm20[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm2 ; AVX512F-SLOW-NEXT: vpbroadcastq 72(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm11[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3,4],ymm2[5,6,7,8],ymm1[9],ymm2[10],ymm1[11,12],ymm2[13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm27[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3,4],ymm4[5,6,7,8],ymm0[9],ymm4[10],ymm0[11,12],ymm4[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm16 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7,8],ymm8[9],ymm3[10],ymm8[11],ymm3[12,13],ymm8[14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm8 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7,8],ymm8[9],ymm12[10],ymm8[11],ymm12[12,13],ymm8[14],ymm12[15] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm11[1],xmm4[2],xmm11[3],xmm4[4,5],xmm11[6],xmm4[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm1[0,1,0,1] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm21[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm9[1],ymm2[2],ymm9[3],ymm2[4,5],ymm9[6],ymm2[7,8],ymm9[9],ymm2[10],ymm9[11],ymm2[12,13],ymm9[14],ymm2[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm9 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm18[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5],xmm0[6],xmm4[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm4, %ymm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2,3],ymm8[4],ymm11[5],ymm8[6],ymm11[7,8],ymm8[9],ymm11[10,11],ymm8[12],ymm11[13],ymm8[14],ymm11[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm18 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX512F-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm29 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,1,1,1] -; AVX512F-SLOW-NEXT: vpandn %ymm10, %ymm9, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm8, %ymm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm22 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm25 -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2],xmm8[3],xmm5[4,5],xmm8[6],xmm5[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm24 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm7, %ymm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm5[2],ymm8[3],ymm5[4],ymm8[5,6],ymm5[7],ymm8[8,9],ymm5[10],ymm8[11],ymm5[12],ymm8[13,14],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm23 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm9 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm4[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm14 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3],xmm14[4],xmm0[5],xmm14[6],xmm0[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm29 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,1,1] +; AVX512F-SLOW-NEXT: vpandn %ymm10, %ymm5, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm10 +; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm14, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm30 +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm7, %ymm5 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7],ymm5[8,9],ymm0[10],ymm5[11],ymm0[12],ymm5[13,14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] ; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] ; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm9, %ymm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm26[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm26[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2],ymm8[3,4],ymm7[5,6,7,8],ymm8[9],ymm7[10],ymm8[11,12],ymm7[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm7, %ymm26 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm4, %ymm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5],ymm7[6],ymm0[7,8],ymm7[9],ymm0[10,11],ymm7[12],ymm0[13],ymm7[14],ymm0[15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15] -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5],ymm4[6],ymm0[7,8],ymm4[9],ymm0[10,11],ymm4[12],ymm0[13],ymm4[14],ymm0[15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm6[1],ymm0[2],ymm6[3,4],ymm0[5,6,7,8],ymm6[9],ymm0[10],ymm6[11,12],ymm0[13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm1, %ymm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1],ymm0[2],ymm6[3],ymm0[4],ymm6[5,6],ymm0[7],ymm6[8,9],ymm0[10],ymm6[11],ymm0[12],ymm6[13,14],ymm0[15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm14, %ymm13 -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm16[1,1,2,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm13 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm21[1,1,2,2] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm3, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8,9],ymm3[10],ymm1[11],ymm3[12],ymm1[13,14],ymm3[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm9 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm6, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,0,3,0,7,4,7,4] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm27[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 ; AVX512F-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpandn %ymm0, %ymm1, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpandnq %ymm0, %ymm21, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm27 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm14 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm28[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10,11],ymm14[12],ymm12[13],ymm14[14],ymm12[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm17[2,3,2,2] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm11, %ymm14 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm28[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0,1],ymm11[2],ymm14[3],ymm11[4],ymm14[5,6],ymm11[7],ymm14[8,9],ymm11[10],ymm14[11],ymm11[12],ymm14[13,14],ymm11[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm21 -; AVX512F-SLOW-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm12 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm18[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm31[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5],ymm9[6],ymm5[7,8],ymm9[9],ymm5[10,11],ymm9[12],ymm5[13],ymm9[14],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm17 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm9, %zmm18, %zmm17 -; AVX512F-SLOW-NEXT: vpbroadcastq 88(%r8), %ymm9 -; AVX512F-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm20 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm9, %zmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm20, %zmm9 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm11, %zmm17 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm18, %zmm13 -; AVX512F-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm4 +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm15, %ymm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm20[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm19[2,3,2,2] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm15, %ymm13 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm20[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3],ymm15[4],ymm13[5,6],ymm15[7],ymm13[8,9],ymm15[10],ymm13[11],ymm15[12],ymm13[13,14],ymm15[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm20 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm13, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm24[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm1[1],ymm15[2],ymm1[3],ymm15[4,5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10],ymm1[11],ymm15[12,13],ymm1[14],ymm15[15] +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm24[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10,11],ymm9[12],ymm1[13],ymm9[14],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, (%rsp), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm24, %zmm17 +; AVX512F-SLOW-NEXT: vpbroadcastq 88(%r8), %ymm1 +; AVX512F-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm22 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm22, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm2, %zmm17 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm24, %zmm5 +; AVX512F-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm3 ; AVX512F-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm17 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm20, %zmm14 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm22[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm22, %zmm3 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm27[0,1,0,1] ; AVX512F-SLOW-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm17 = mem[0,1,0,0] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm12, %zmm5 -; AVX512F-SLOW-NEXT: vpbroadcastq 80(%r8), %ymm12 -; AVX512F-SLOW-NEXT: vpandn %ymm12, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm11 -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm12, %ymm11 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm25[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm24[0,1,0,0] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm15, %zmm13 +; AVX512F-SLOW-NEXT: vpbroadcastq 80(%r8), %ymm15 +; AVX512F-SLOW-NEXT: vpandnq %ymm15, %ymm21, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm9 +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm5 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm30[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm28[0,1,0,0] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm19[2,3,2,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm15[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm26[2,2,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm29 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm11, %zmm19, %zmm26 -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm11, %zmm29, %zmm10 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm13, %zmm11 -; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm13 = mem[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm11, %zmm17, %zmm13 -; AVX512F-SLOW-NEXT: vpbroadcastq 64(%r8), %ymm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm12, %zmm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm13 -; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm19 = mem[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm13, %zmm17, %zmm19 -; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm12, %zmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm12 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm25, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm12, %zmm17, %zmm13 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm26[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm23[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm10[2,2,3,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm10 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm29 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm15, %zmm10, %zmm23 +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm15, %zmm29, %zmm25 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm19, %zmm15 +; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm17 = mem[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm15, %zmm19, %zmm17 +; AVX512F-SLOW-NEXT: vpbroadcastq 64(%r8), %ymm15 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm15, %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm15, %zmm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm17 +; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm21 = mem[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm19, %zmm21 +; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm17 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm17, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm15, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm27, %zmm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm14, %zmm19, %zmm8 ; AVX512F-SLOW-NEXT: vpbroadcastq 112(%r8), %ymm12 -; AVX512F-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm15 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm15, %zmm12 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm17, %zmm3 +; AVX512F-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm14, %zmm12 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm19, %zmm2 ; AVX512F-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm4 -; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm15, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm18, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm21, %zmm18, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm3, %zmm2, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm3, %zmm5, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 384(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm14, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm18, %zmm24, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm20, %zmm24, %zmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm2, %zmm11, %zmm16 +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm2, %zmm13, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 384(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 64(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 256(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 576(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 192(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 128(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 320(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 448(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 512(%r9) -; AVX512F-SLOW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 192(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 320(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 448(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 512(%r9) +; AVX512F-SLOW-NEXT: addq $616, %rsp # imm = 0x268 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride5_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm7 +; AVX512F-FAST-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm8 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, %ymm9 ; AVX512F-FAST-NEXT: vmovdqa64 96(%rdx), %ymm20 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm15 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,2,2,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm13 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[1,2,2,2] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm8 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm10 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512F-FAST-NEXT: vmovdqa64 96(%rdi), %ymm23 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm23[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm12[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] ; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 ; AVX512F-FAST-NEXT: vpbroadcastq 104(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm14 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpandn %ymm1, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm26 +; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm4 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpandnq %ymm2, %ymm22, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm16 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm5 +; AVX512F-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7] +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm11 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX512F-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm10 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 32(%rdx), %ymm24 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm24[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm5 +; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm15 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm15[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2],xmm7[3],xmm2[4,5],xmm7[6],xmm2[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 32(%rdx), %ymm27 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqa64 32(%rdi), %ymm30 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm30[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13],ymm1[14],ymm4[15] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm4 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm30[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5],ymm4[6],ymm7[7,8],ymm4[9],ymm7[10,11],ymm4[12],ymm7[13],ymm4[14],ymm7[15] ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,1] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpandn %ymm3, %ymm13, %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3],xmm7[4],xmm0[5],xmm7[6],xmm0[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,1,1] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpandnq %ymm4, %ymm22, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm17 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vprolq $16, %ymm8, %ymm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8,9],ymm1[10],ymm3[11],ymm1[12],ymm3[13,14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm12[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vprolq $16, %ymm10, %ymm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, %ymm12 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm0 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[3,2,3,3,7,6,7,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 ; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %ymm20 -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm0 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[3,2,3,3,7,6,7,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm15 -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm9 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm9, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %ymm29 +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm29[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7,8],ymm8[9],ymm0[10],ymm8[11],ymm0[12,13],ymm8[14],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm30[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vprolq $16, %ymm7, %ymm8 +; AVX512F-FAST-NEXT: vprolq $16, %ymm2, %ymm8 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3],ymm0[4],ymm8[5,6],ymm0[7],ymm8[8,9],ymm0[10],ymm8[11],ymm0[12],ymm8[13,14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm30[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4,5],ymm7[6],ymm0[7,8],ymm7[9],ymm0[10],ymm7[11],ymm0[12,13],ymm7[14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm27[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5],ymm7[6],ymm0[7,8],ymm7[9],ymm0[10,11],ymm7[12],ymm0[13],ymm7[14],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm27[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %ymm27 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm27[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2],ymm6[3,4],ymm2[5,6,7,8],ymm6[9],ymm2[10],ymm6[11,12],ymm2[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm7 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm23[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm10[1],ymm4[2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7,8],ymm10[9],ymm4[10],ymm10[11],ymm4[12,13],ymm10[14],ymm4[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm24[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm14, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm24[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %ymm30 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm30[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm7[1],ymm2[2],ymm7[3,4],ymm2[5,6,7,8],ymm7[9],ymm2[10],ymm7[11,12],ymm2[13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm8 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm15 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm9[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm6 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm10 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm23[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5],ymm1[6],ymm11[7,8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13],ymm1[14],ymm11[15] -; AVX512F-FAST-NEXT: vprolq $16, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm23[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm11[2],ymm5[3],ymm11[4],ymm5[5,6],ymm11[7],ymm5[8,9],ymm11[10],ymm5[11],ymm11[12],ymm5[13,14],ymm11[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm23 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10,11],ymm5[12],ymm0[13],ymm5[14],ymm0[15] +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm7 +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm11 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm9[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5],ymm11[6],ymm13[7,8],ymm11[9],ymm13[10,11],ymm11[12],ymm13[13],ymm11[14],ymm13[15] +; AVX512F-FAST-NEXT: vprolq $16, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3],ymm9[4],ymm0[5,6],ymm9[7],ymm0[8,9],ymm9[10],ymm0[11],ymm9[12],ymm0[13,14],ymm9[15] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm27 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm30[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7,8],ymm0[9],ymm9[10],ymm0[11],ymm9[12,13],ymm0[14],ymm9[15] +; AVX512F-FAST-NEXT: vmovdqa %ymm12, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm30[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10,11],ymm9[12],ymm1[13],ymm9[14],ymm1[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm9 ; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm31 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpandnq %ymm0, %ymm31, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm30 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpandnq %ymm0, %ymm30, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm11 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm30 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm15[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5],ymm0[6],ymm11[7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13],ymm0[14],ymm11[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm17[0,1,0,0] -; AVX512F-FAST-NEXT: vprolq $16, %ymm9, %ymm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm15[2],ymm9[3],ymm15[4],ymm9[5,6],ymm15[7],ymm9[8,9],ymm15[10],ymm9[11],ymm15[12],ymm9[13,14],ymm15[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm20[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2],ymm9[3],ymm15[4,5],ymm9[6],ymm15[7,8],ymm9[9],ymm15[10],ymm9[11],ymm15[12,13],ymm9[14],ymm15[15] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm3 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,1] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm20[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3],ymm15[4],ymm3[5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10,11],ymm15[12],ymm3[13],ymm15[14],ymm3[15] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm13 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm13 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm29[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5],ymm0[6],ymm12[7,8],ymm0[9],ymm12[10,11],ymm0[12],ymm12[13],ymm0[14],ymm12[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm16[0,1,0,0] +; AVX512F-FAST-NEXT: vprolq $16, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm10[2],ymm3[3],ymm10[4],ymm3[5,6],ymm10[7],ymm3[8,9],ymm10[10],ymm3[11],ymm10[12],ymm3[13,14],ymm10[15] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm9 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm9[0,1,1,1] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm3 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4,5],ymm3[6],ymm10[7,8],ymm3[9],ymm10[10],ymm3[11],ymm10[12,13],ymm3[14],ymm10[15] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm11[0,1,1,1] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm20[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5],ymm11[6],ymm4[7,8],ymm11[9],ymm4[10,11],ymm11[12],ymm4[13],ymm11[14],ymm4[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm4[0,1,1,1] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vpbroadcastq 80(%r8), %ymm20 -; AVX512F-FAST-NEXT: vpandnq %ymm20, %ymm31, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm16 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm18 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm9, %zmm16, %zmm20 -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm9, %zmm18, %zmm29 -; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm9 = mem[2,3,2,3] +; AVX512F-FAST-NEXT: vpandnq %ymm20, %ymm30, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm26 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm28 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm29 # 64-byte Reload +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm1, %zmm26, %zmm29 +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm1, %zmm28, %zmm27 +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm1 = mem[2,3,2,3] ; AVX512F-FAST-NEXT: vpermq $174, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm16 = mem[2,3,2,2] -; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm17 = mem[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq $186, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm18 = mem[2,2,3,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,2] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm20 = mem[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,2,3,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm25[2,2,3,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm8[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm31[2,3,2,2] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,1,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,3,2,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,1,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm25[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm24[2,3,2,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm22[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm21[2,2,3,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm19[0,1,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, (%rsp), %zmm19, %zmm19 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm21 = mem[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm19, %zmm22, %zmm21 -; AVX512F-FAST-NEXT: vpbroadcastq 64(%r8), %ymm19 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm19, %zmm15 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm21, %zmm19, %zmm15 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm21, %zmm11 -; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm21 = mem[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm11, %zmm22, %zmm21 -; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm5 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm21, %zmm19, %zmm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm9, %zmm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm11 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm9, %zmm22, %zmm11 -; AVX512F-FAST-NEXT: vpbroadcastq 112(%r8), %ymm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm22[2,3,2,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm19[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm18[2,2,3,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm17[2,3,2,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm18 = mem[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm17, %zmm19, %zmm18 +; AVX512F-FAST-NEXT: vpbroadcastq 64(%r8), %ymm17 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm17, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm18, %zmm17, %zmm11 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm18, %zmm12 +; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm18 = mem[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm12, %zmm19, %zmm18 +; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm18, %zmm17, %zmm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm12 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm19, %zmm12 +; AVX512F-FAST-NEXT: vpbroadcastq 112(%r8), %ymm1 ; AVX512F-FAST-NEXT: vpbroadcastq 120(%r8), %ymm16 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm11, %zmm16, %zmm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm26, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm28, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm11, %zmm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm31, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm22, %zmm8 -; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm2 -; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm8, %zmm16, %zmm2 -; AVX512F-FAST-NEXT: vpbroadcastq 88(%r8), %ymm8 -; AVX512F-FAST-NEXT: vpbroadcastq 96(%r8), %ymm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm12, %zmm8 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm11, %zmm6 -; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm4 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm12, %zmm16, %zmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm26, %zmm6 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm28, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm6, %zmm12, %zmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm21, %zmm6 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm31, %zmm14 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm6, %zmm19, %zmm14 +; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm6 +; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm17 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm6, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm14, %zmm16, %zmm6 +; AVX512F-FAST-NEXT: vpbroadcastq 88(%r8), %ymm14 +; AVX512F-FAST-NEXT: vpbroadcastq 96(%r8), %ymm16 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm14, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm16, %zmm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm8, %zmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm12, %zmm5 +; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm2 ; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm6, %zmm12, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm23, %zmm11, %zmm27 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm11, %zmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm5, %zmm16, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm24, %zmm12, %zmm9 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm12, %zmm3 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm0, %zmm27, %zmm30 -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm0, %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 384(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 64(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 128(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 256(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 448(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 576(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 192(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 320(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 512(%r9) -; AVX512F-FAST-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm0, %zmm9, %zmm13 +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm0, %zmm3, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 384(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 128(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 256(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 448(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 576(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, (%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 192(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 320(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 512(%r9) +; AVX512F-FAST-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index c20981d0d9398..66fdffccdfddc 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -875,37 +875,36 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa 16(%rcx), %xmm2 ; SSE-NEXT: movdqa 16(%r8), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; SSE-NEXT: movdqa 16(%r9), %xmm8 +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; SSE-NEXT: movdqa %xmm11, %xmm5 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm9[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm6[3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm0[0,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3] ; SSE-NEXT: movaps {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,0] ; SSE-NEXT: andps %xmm7, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] ; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: andnps %xmm8, %xmm0 +; SSE-NEXT: andnps %xmm9, %xmm0 ; SSE-NEXT: orps %xmm5, %xmm0 ; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; SSE-NEXT: movdqa %xmm15, %xmm13 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm5[3,3] ; SSE-NEXT: movdqa (%r8), %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,2],xmm8[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,2],xmm9[2,3] ; SSE-NEXT: movdqa (%r9), %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm0, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: andnps %xmm1, %xmm0 @@ -917,11 +916,11 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm12[3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] ; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: andnps %xmm4, %xmm0 @@ -938,41 +937,41 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[0,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] ; SSE-NEXT: andps %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: andnps %xmm3, %xmm7 ; SSE-NEXT: orps %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm11[0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm11[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm5[0,2] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pslld $16, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm11[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[0,2] +; SSE-NEXT: pslld $16, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: pandn %xmm4, %xmm13 +; SSE-NEXT: pandn %xmm8, %xmm13 ; SSE-NEXT: andps %xmm1, %xmm3 ; SSE-NEXT: por %xmm3, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] ; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[0,2] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 ; SSE-NEXT: andps %xmm3, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: por %xmm4, %xmm9 ; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm14[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm10[1,1,1,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm14[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] ; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 ; SSE-NEXT: andps %xmm3, %xmm4 @@ -1001,33 +1000,33 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm6[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2] ; SSE-NEXT: andps %xmm3, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,1,1] ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] ; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[0,2] ; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: andps %xmm1, %xmm9 -; SSE-NEXT: por %xmm9, %xmm0 +; SSE-NEXT: andps %xmm1, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm10[0,2] -; SSE-NEXT: andps %xmm1, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm10[0,2] +; SSE-NEXT: andps %xmm1, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pslld $16, %xmm2 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm1, 48(%rax) ; SSE-NEXT: movdqa %xmm0, 96(%rax) @@ -1035,7 +1034,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm11, 160(%rax) ; SSE-NEXT: movdqa %xmm15, (%rax) ; SSE-NEXT: movdqa %xmm8, 16(%rax) -; SSE-NEXT: movdqa %xmm5, 64(%rax) +; SSE-NEXT: movdqa %xmm9, 64(%rax) ; SSE-NEXT: movdqa %xmm13, 144(%rax) ; SSE-NEXT: movaps %xmm7, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1049,93 +1048,93 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i16_stride6_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm11[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm12 -; AVX1-ONLY-NEXT: vpslld $16, %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm11 +; AVX1-ONLY-NEXT: vpslld $16, %xmm11, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm12[5],xmm7[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm10[2],ymm0[3,4],ymm10[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm13 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm14 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1],xmm10[2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm3[1],xmm10[2,3,4,5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm2[1],xmm12[2,3,4,5,6],xmm2[7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0],xmm2[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm3[0,1,2],xmm15[3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm2[0,1,2],xmm15[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3],ymm3[4],ymm8[5,6],ymm3[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm3[0,1],xmm11[0],xmm3[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2,3],ymm2[4],ymm8[5,6],ymm2[7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm2[0,1],xmm10[0],xmm2[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1,2],xmm9[3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2],xmm9[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 @@ -1144,28 +1143,28 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm13[0],xmm3[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm14[0],xmm3[3] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5],xmm3[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm1[1,2],xmm2[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5,6],xmm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm13[4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -1176,7 +1175,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqa %xmm9, 112(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm8, 96(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm15, 64(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 80(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 80(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm7, 176(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rax) @@ -1189,50 +1188,50 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-SLOW-LABEL: store_i16_stride6_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm5 -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm8 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm7, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm8 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,7,6,5] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm10 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm9[1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm5[1,2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm11 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm12, %ymm9 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm12, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm14 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm14 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm12 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] @@ -1240,9 +1239,9 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,1,1,1] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] @@ -1254,25 +1253,25 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm15[2],ymm9[3,4],ymm15[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm15[1],ymm9[2,3],ymm15[4],ymm9[5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm15, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,0,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,0,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] ; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero ; AVX2-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] @@ -1280,15 +1279,15 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,2,2,5,4,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 @@ -1311,30 +1310,30 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm8 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm7 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm8 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm11 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FAST-NEXT: vpbroadcastq %xmm10, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm10 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,1,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm11 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-FAST-NEXT: vpbroadcastq %xmm9, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4],ymm9[5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm9 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,1,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm5[1,2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm11 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm12, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm14 ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm12 @@ -1353,14 +1352,14 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,0,3,2,1,0,3,2] ; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,1,1,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] @@ -1368,26 +1367,26 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,0,7,6,5,0,7,6] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,0,7,6,5,0,7,6] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm13[4],ymm1[4],ymm13[5],ymm1[5],ymm13[6],ymm1[6],ymm13[7],ymm1[7],ymm13[12],ymm1[12],ymm13[13],ymm1[13],ymm13[14],ymm1[14],ymm13[15],ymm1[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm15[1],ymm7[2,3],ymm15[4],ymm7[5,6],ymm15[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm15, %ymm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,0,2,2,1,0,2,2] ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5,6],ymm7[7] -; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero ; AVX2-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] @@ -1421,120 +1420,128 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride6_vf16: ; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: subq $24, %rsp ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm10[4],ymm0[5],ymm10[5],ymm0[6],ymm10[6],ymm0[7],ymm10[7],ymm0[12],ymm10[12],ymm0[13],ymm10[13],ymm0[14],ymm10[14],ymm0[15],ymm10[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm15, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm3[0,0,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm14, %ymm15, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[8],ymm10[8],ymm0[9],ymm10[9],ymm0[10],ymm10[10],ymm0[11],ymm10[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,0,2,2,5,4,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm15 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm10, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm7 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm6 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm7 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm0[1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm4 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm4 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $24, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1751,67 +1758,65 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride6_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $280, %rsp # imm = 0x118 -; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: subq $312, %rsp # imm = 0x138 +; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm13 -; SSE-NEXT: movdqa (%rsi), %xmm2 -; SSE-NEXT: movdqa 16(%rsi), %xmm9 -; SSE-NEXT: movdqa (%rdx), %xmm12 +; SSE-NEXT: movdqa (%rsi), %xmm3 +; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm5 ; SSE-NEXT: movdqa 16(%rdx), %xmm14 ; SSE-NEXT: movdqa (%rcx), %xmm4 ; SSE-NEXT: movdqa 16(%rcx), %xmm10 -; SSE-NEXT: movdqa (%r8), %xmm7 +; SSE-NEXT: movdqa (%r8), %xmm8 ; SSE-NEXT: movdqa (%r9), %xmm11 -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm3[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm0[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,2],xmm7[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] ; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: andps %xmm6, %xmm9 +; SSE-NEXT: movdqa %xmm11, %xmm7 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] ; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: andnps %xmm11, %xmm0 -; SSE-NEXT: orps %xmm8, %xmm0 +; SSE-NEXT: orps %xmm9, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm12[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] -; SSE-NEXT: andps %xmm6, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm5[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0,1,3] +; SSE-NEXT: andps %xmm6, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] ; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: andnps %xmm4, %xmm0 -; SSE-NEXT: orps %xmm2, %xmm0 +; SSE-NEXT: orps %xmm3, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm0[3,3] -; SSE-NEXT: movdqa 16(%r8), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm8[0,1] -; SSE-NEXT: movdqa 16(%r9), %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa 16(%r8), %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm15[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm9[0,1] +; SSE-NEXT: movdqa 16(%r9), %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm9[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] ; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: andnps %xmm12, %xmm0 @@ -1820,292 +1825,293 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: orps %xmm11, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] -; SSE-NEXT: movdqa %xmm14, (%rsp) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm14[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,2],xmm10[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm14[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm10[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] ; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: andnps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] -; SSE-NEXT: andps %xmm6, %xmm9 -; SSE-NEXT: orps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] +; SSE-NEXT: andps %xmm6, %xmm1 +; SSE-NEXT: orps %xmm1, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdx), %xmm1 -; SSE-NEXT: movdqa 32(%rcx), %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movdqa 32(%rdx), %xmm2 +; SSE-NEXT: movdqa 32(%rcx), %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm12 -; SSE-NEXT: movdqa 32(%rsi), %xmm10 -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm0[3,3] -; SSE-NEXT: movdqa 32(%r8), %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm13[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm14[0,1] -; SSE-NEXT: movdqa 32(%r9), %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm0[3,3] +; SSE-NEXT: movdqa 32(%r8), %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm14[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,2],xmm13[0,1] +; SSE-NEXT: movdqa 32(%r9), %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: movaps %xmm6, %xmm14 -; SSE-NEXT: andnps %xmm0, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] -; SSE-NEXT: andps %xmm6, %xmm15 -; SSE-NEXT: orps %xmm15, %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm13[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm9[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: andnps %xmm9, %xmm10 +; SSE-NEXT: movaps %xmm6, %xmm13 +; SSE-NEXT: andnps %xmm0, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0,1,3] +; SSE-NEXT: andps %xmm6, %xmm12 +; SSE-NEXT: orps %xmm12, %xmm13 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: movaps %xmm6, %xmm11 +; SSE-NEXT: andnps %xmm1, %xmm11 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: andps %xmm6, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm0 +; SSE-NEXT: orps %xmm0, %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdx), %xmm3 ; SSE-NEXT: movdqa 48(%rcx), %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm3[3,3] -; SSE-NEXT: movdqa 48(%r8), %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm10[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm14[0,1] -; SSE-NEXT: movdqa 48(%r9), %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm14, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] -; SSE-NEXT: andps %xmm6, %xmm15 -; SSE-NEXT: orps %xmm15, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa 48(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[3,3] +; SSE-NEXT: movdqa 48(%r8), %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm12[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm13[0,1] +; SSE-NEXT: movdqa 48(%r9), %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: andnps %xmm13, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0,1,3] +; SSE-NEXT: andps %xmm6, %xmm11 +; SSE-NEXT: orps %xmm11, %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm3[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm9[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] -; SSE-NEXT: andps %xmm6, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; SSE-NEXT: andnps %xmm9, %xmm6 -; SSE-NEXT: orps %xmm2, %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: andps %xmm6, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: andnps %xmm1, %xmm6 +; SSE-NEXT: orps %xmm0, %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: andps %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm7[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm5[0,2] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: andps %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm4[0,2] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: andps %xmm5, %xmm14 -; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: andps %xmm5, %xmm11 +; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,1],xmm0[1,3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm14[0,2] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,1],xmm0[1,3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm11[0,2] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: andps %xmm9, %xmm6 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm15[1] -; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm15[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[0,2] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: andps %xmm1, %xmm6 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[0,2] ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: andps %xmm5, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: andps %xmm5, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm1[0,2] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: andps %xmm9, %xmm6 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[1,1] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,0,1,1] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: andps %xmm1, %xmm6 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm15[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 ; SSE-NEXT: andps %xmm5, %xmm6 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: andps %xmm9, %xmm7 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm14, %xmm7 -; SSE-NEXT: andps %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm11[0] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm11[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm14 -; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm14, %xmm3 -; SSE-NEXT: andps %xmm9, %xmm8 -; SSE-NEXT: por %xmm8, %xmm3 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm13[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm11[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,1,1] +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm6[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm9, %xmm7 +; SSE-NEXT: andps %xmm1, %xmm8 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[0,2] ; SSE-NEXT: movdqa %xmm5, %xmm8 ; SSE-NEXT: pandn %xmm11, %xmm8 -; SSE-NEXT: andps %xmm5, %xmm14 -; SSE-NEXT: por %xmm14, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm12[0] -; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,1],xmm12[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm11[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] +; SSE-NEXT: andps %xmm5, %xmm15 +; SSE-NEXT: por %xmm15, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm10[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm3[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm11, %xmm6 +; SSE-NEXT: andps %xmm1, %xmm9 +; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm2[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm14[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm10[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pandn %xmm13, %xmm9 +; SSE-NEXT: andps %xmm5, %xmm11 +; SSE-NEXT: por %xmm11, %xmm9 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm13[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: andps %xmm9, %xmm14 -; SSE-NEXT: por %xmm14, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm1[1] -; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm12[0,2] -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm15, %xmm12 -; SSE-NEXT: andps %xmm5, %xmm13 -; SSE-NEXT: por %xmm13, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: andps %xmm1, %xmm11 +; SSE-NEXT: por %xmm11, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: psrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm2[0,2] +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: pandn %xmm13, %xmm10 +; SSE-NEXT: andps %xmm5, %xmm14 +; SSE-NEXT: por %xmm14, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm11 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movdqa %xmm12, %xmm13 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm13[0,2] -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm15[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: pandn %xmm15, %xmm13 -; SSE-NEXT: andps %xmm9, %xmm14 -; SSE-NEXT: por %xmm14, %xmm13 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm10[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: andps %xmm5, %xmm15 -; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm13[0,2] +; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: pandn %xmm14, %xmm13 +; SSE-NEXT: andps %xmm1, %xmm11 +; SSE-NEXT: por %xmm11, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: andps %xmm5, %xmm14 +; SSE-NEXT: por %xmm14, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[0,2] -; SSE-NEXT: andps %xmm9, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[2,2,3,3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movdqa %xmm12, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm14[0,2] +; SSE-NEXT: andps %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm4 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[0,2] -; SSE-NEXT: andps %xmm5, %xmm10 -; SSE-NEXT: pandn %xmm15, %xmm5 -; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[0,2] +; SSE-NEXT: andps %xmm5, %xmm12 +; SSE-NEXT: pandn %xmm14, %xmm5 +; SSE-NEXT: por %xmm12, %xmm5 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm5, 352(%rax) -; SSE-NEXT: movdqa %xmm9, 336(%rax) -; SSE-NEXT: movdqa %xmm14, 304(%rax) +; SSE-NEXT: movdqa %xmm1, 336(%rax) +; SSE-NEXT: movdqa %xmm11, 304(%rax) ; SSE-NEXT: movdqa %xmm13, 288(%rax) -; SSE-NEXT: movdqa %xmm12, 256(%rax) -; SSE-NEXT: movdqa %xmm11, 240(%rax) -; SSE-NEXT: movdqa %xmm8, 208(%rax) -; SSE-NEXT: movdqa %xmm3, 192(%rax) -; SSE-NEXT: movdqa %xmm7, 160(%rax) -; SSE-NEXT: movdqa %xmm6, 144(%rax) +; SSE-NEXT: movdqa %xmm10, 256(%rax) +; SSE-NEXT: movdqa %xmm15, 240(%rax) +; SSE-NEXT: movdqa %xmm9, 208(%rax) +; SSE-NEXT: movdqa %xmm6, 192(%rax) +; SSE-NEXT: movdqa %xmm8, 160(%rax) +; SSE-NEXT: movdqa %xmm7, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2134,7 +2140,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: addq $280, %rsp # imm = 0x118 +; SSE-NEXT: addq $312, %rsp # imm = 0x138 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride6_vf32: @@ -2146,133 +2152,133 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm12[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm0 -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm2 +; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm12[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0,1],xmm13[2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm13[2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm5 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6],xmm14[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm13[3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm12[0],xmm5[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3,4],xmm0[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6],ymm5[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm7 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0],xmm5[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5,6],xmm3[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm2 @@ -2317,28 +2323,28 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0,1],xmm13[2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6],xmm14[7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm14 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm14 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3],xmm12[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] @@ -2358,39 +2364,39 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[1,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm8 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0],xmm6[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0],xmm7[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm9[5],xmm7[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2],xmm1[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6],xmm0[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, (%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm8, 16(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 96(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm10, 112(%rax) @@ -2433,24 +2439,22 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-SLOW-LABEL: store_i16_stride6_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: subq $616, %rsp # imm = 0x268 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm15 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] @@ -2460,274 +2464,267 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm10 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,2,1] ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm14 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm15 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm9, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm12 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5,6],ymm12[7] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = mem[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm12, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm4 +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2],ymm12[3,4],ymm0[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm9 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm9, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm3[4],mem[4],ymm3[5],mem[5],ymm3[6],mem[6],ymm3[7],mem[7],ymm3[12],mem[12],ymm3[13],mem[13],ymm3[14],mem[14],ymm3[15],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3,4],ymm0[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm14, %ymm12 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm12, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm14, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm2[4],ymm13[4],ymm2[5],ymm13[5],ymm2[6],ymm13[6],ymm2[7],ymm13[7],ymm2[12],ymm13[12],ymm2[13],ymm13[13],ymm2[14],ymm13[14],ymm2[15],ymm13[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2],ymm14[3,4],ymm6[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm14 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm15[4],ymm4[4],ymm15[5],ymm4[5],ymm15[6],ymm4[6],ymm15[7],ymm4[7],ymm15[12],ymm4[12],ymm15[13],ymm4[13],ymm15[14],ymm4[14],ymm15[15],ymm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm3 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6],ymm9[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm12, %ymm15 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,0,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5,6],ymm12[7] -; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero -; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm12, %ymm9 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, %ymm12 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = mem[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[2],mem[2],ymm5[3],mem[3],ymm5[8],mem[8],ymm5[9],mem[9],ymm5[10],mem[10],ymm5[11],mem[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[2],mem[2],ymm7[3],mem[3],ymm7[8],mem[8],ymm7[9],mem[9],ymm7[10],mem[10],ymm7[11],mem[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,0,2,2,5,4,6,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,2,2,5,4,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,2,2,5,4,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 288(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 352(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 288(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX2-SLOW-NEXT: addq $616, %rsp # imm = 0x268 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride6_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $616, %rsp # imm = 0x268 +; AVX2-FAST-NEXT: subq $648, %rsp # imm = 0x288 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 @@ -2751,22 +2748,23 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm10 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm14 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm12 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero @@ -2775,19 +2773,20 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm9 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2797,11 +2796,12 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] @@ -2811,202 +2811,202 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm9 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[2],ymm1[2],ymm9[3],ymm1[3],ymm9[8],ymm1[8],ymm9[9],ymm1[9],ymm9[10],ymm1[10],ymm9[11],ymm1[11] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm5 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm0[1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,2,1,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm13 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,2,1,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm13 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,1,1,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm9 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm15[4],ymm14[4],ymm15[5],ymm14[5],ymm15[6],ymm14[6],ymm15[7],ymm14[7],ymm15[12],ymm14[12],ymm15[13],ymm14[13],ymm15[14],ymm14[14],ymm15[15],ymm14[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,5,6,5,6,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm9 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm4[4],ymm10[5],ymm4[5],ymm10[6],ymm4[6],ymm10[7],ymm4[7],ymm10[12],ymm4[12],ymm10[13],ymm4[13],ymm10[14],ymm4[14],ymm10[15],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [5,6,5,6,5,6,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[4],mem[4],ymm3[5],mem[5],ymm3[6],mem[6],ymm3[7],mem[7],ymm3[12],mem[12],ymm3[13],mem[13],ymm3[14],mem[14],ymm3[15],mem[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm13 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm13, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm13 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero +; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero -; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm4[1],ymm12[2,3],ymm4[4],ymm12[5,6],ymm4[7] -; AVX2-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = mem[0,0,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[8],ymm14[8],ymm15[9],ymm14[9],ymm15[10],ymm14[10],ymm15[11],ymm14[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[0,0,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[2],mem[2],ymm7[3],mem[3],ymm7[8],mem[8],ymm7[9],mem[9],ymm7[10],mem[10],ymm7[11],mem[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [5,4,2,2,5,4,6,6] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm15, %ymm12 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7] -; AVX2-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6],ymm3[7] +; AVX2-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[8],ymm9[8],ymm5[9],ymm9[9],ymm5[10],ymm9[10],ymm5[11],ymm9[11] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm15, %ymm5 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm15, %ymm7 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm0 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 288(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm13, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 288(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 192(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $616, %rsp # imm = 0x268 +; AVX2-FAST-NEXT: addq $648, %rsp # imm = 0x288 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride6_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $680, %rsp # imm = 0x2A8 +; AVX2-FAST-PERLANE-NEXT: subq $648, %rsp # imm = 0x288 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 @@ -3036,16 +3036,16 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3055,45 +3055,45 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3102,169 +3102,165 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm11 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[8],ymm1[8],ymm11[9],ymm1[9],ymm11[10],ymm1[10],ymm11[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm0[1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm11, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm8, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm4[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm14, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1],ymm8[2],ymm14[3,4],ymm8[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm8, %ymm11, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[12],ymm9[12],ymm12[13],ymm9[13],ymm12[14],ymm9[14],ymm12[15],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm9[4],ymm7[4],ymm9[5],ymm7[5],ymm9[6],ymm7[6],ymm9[7],ymm7[7],ymm9[12],ymm7[12],ymm9[13],ymm7[13],ymm9[14],ymm7[14],ymm9[15],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[12],ymm10[12],ymm12[13],ymm10[13],ymm12[14],ymm10[14],ymm12[15],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm11, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3],ymm14[4],ymm8[5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm6[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm15, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm9[4],ymm7[5],ymm9[5],ymm7[6],ymm9[6],ymm7[7],ymm9[7],ymm7[12],ymm9[12],ymm7[13],ymm9[13],ymm7[14],ymm9[14],ymm7[15],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm14[4],ymm8[4],ymm14[5],ymm8[5],ymm14[6],ymm8[6],ymm14[7],ymm8[7],ymm14[12],ymm8[12],ymm14[13],ymm8[13],ymm14[14],ymm8[14],ymm14[15],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm11[4],mem[4],ymm11[5],mem[5],ymm11[6],mem[6],ymm11[7],mem[7],ymm11[12],mem[12],ymm11[13],mem[13],ymm11[14],mem[14],ymm11[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm15[4],ymm14[4],ymm15[5],ymm14[5],ymm15[6],ymm14[6],ymm15[7],ymm14[7],ymm15[12],ymm14[12],ymm15[13],ymm14[13],ymm15[14],ymm14[14],ymm15[15],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm13, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,0,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm13 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm13, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,0,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm13, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm11, %ymm13, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3,4],ymm15[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[0,0,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm13, %ymm15, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[8],ymm10[8],ymm12[9],ymm10[9],ymm12[10],ymm10[10],ymm12[11],ymm10[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[8],ymm7[8],ymm9[9],ymm7[9],ymm9[10],ymm7[10],ymm9[11],ymm7[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,0,2,2,5,4,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm9, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm14[0],ymm8[0],ymm14[1],ymm8[1],ymm14[2],ymm8[2],ymm14[3],ymm8[3],ymm14[8],ymm8[8],ymm14[9],ymm8[9],ymm14[10],ymm8[10],ymm14[11],ymm8[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,2,2,5,4,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[2],ymm13[2],ymm2[3],ymm13[3],ymm2[8],ymm13[8],ymm2[9],ymm13[9],ymm2[10],ymm13[10],ymm2[11],ymm13[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[8],ymm10[8],ymm12[9],ymm10[9],ymm12[10],ymm10[10],ymm12[11],ymm10[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm15[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[8],ymm9[8],ymm7[9],ymm9[9],ymm7[10],ymm9[10],ymm7[11],ymm9[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm2[0],ymm14[0],ymm2[1],ymm14[1],ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[8],ymm14[8],ymm2[9],ymm14[9],ymm2[10],ymm14[10],ymm2[11],ymm14[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm8[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 288(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3273,13 +3269,13 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX2-FAST-PERLANE-NEXT: addq $648, %rsp # imm = 0x288 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -3288,7 +3284,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm24 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm23 @@ -3303,12 +3299,12 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm11[0],ymm8[0],ymm11[1],ymm8[1],ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[8],ymm8[8],ymm11[9],ymm8[9],ymm11[10],ymm8[10],ymm11[11],ymm8[11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm12[0],ymm5[1],ymm12[1],ymm5[2],ymm12[2],ymm5[3],ymm12[3],ymm5[8],ymm12[8],ymm5[9],ymm12[9],ymm5[10],ymm12[10],ymm5[11],ymm12[11] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 @@ -3316,58 +3312,58 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm26 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm1[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm27 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm0[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm6, %xmm30 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[1,1,1,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm0[1,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,0,2,2,5,4,6,6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm16[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm16[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm10 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] ; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm10, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm7, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm13 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5,6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm10[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm9 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm10 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] @@ -3380,7 +3376,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm13[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm13[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] @@ -3388,99 +3384,99 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5,6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm10[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm4[1],ymm9[2,3],ymm4[4],ymm9[5,6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm4[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,2,2,2] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm2[0,1,2,3],zmm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm4 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm11[4],ymm8[4],ymm11[5],ymm8[5],ymm11[6],ymm8[6],ymm11[7],ymm8[7],ymm11[12],ymm8[12],ymm11[13],ymm8[13],ymm11[14],ymm8[14],ymm11[15],ymm8[15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm2[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm12[4],ymm5[5],ymm12[5],ymm5[6],ymm12[6],ymm5[7],ymm12[7],ymm5[12],ymm12[12],ymm5[13],ymm12[13],ymm5[14],ymm12[14],ymm5[15],ymm12[15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm31[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm31[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm3[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[8],ymm3[8],ymm9[9],ymm3[9],ymm9[10],ymm3[10],ymm9[11],ymm3[11] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm4[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm9[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm9[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm14[4],ymm10[4],ymm14[5],ymm10[5],ymm14[6],ymm10[6],ymm14[7],ymm10[7],ymm14[12],ymm10[12],ymm14[13],ymm10[13],ymm14[14],ymm10[14],ymm14[15],ymm10[15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm9[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm9[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm15, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm1[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm29, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm30, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm29[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm30[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[2,1,3,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,0,2,2] ; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero @@ -3490,55 +3486,55 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm24, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm24[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm23[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm23[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm27, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm26, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[2,1,3,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm27[0],zero,xmm27[1],zero,xmm27[2],zero,xmm27[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm26[0],zero,xmm26[1],zero,xmm26[2],zero,xmm26[3],zero ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] ; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm27, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm9, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm22, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm9, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm3, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm16, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm14, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm9, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; @@ -3546,16 +3542,16 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: pushq %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[8],ymm1[8],ymm12[9],ymm1[9],ymm12[10],ymm1[10],ymm12[11],ymm1[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm20, %zmm23 @@ -3565,212 +3561,212 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[8],ymm9[8],ymm5[9],ymm9[9],ymm5[10],ymm9[10],ymm5[11],ymm9[11] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [8,9,20,11,12,21,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm21 ; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm21 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[8],ymm1[8],ymm15[9],ymm1[9],ymm15[10],ymm1[10],ymm15[11],ymm1[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,9,2,3,8,5,6,11] ; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm16, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm29 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm0[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm0[1,1,1,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm0[1,1,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[2],ymm11[2],ymm0[3],ymm11[3],ymm0[8],ymm11[8],ymm0[9],ymm11[9],ymm0[10],ymm11[10],ymm0[11],ymm11[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm2, %zmm20, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm25 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm8, %zmm25, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm14, %ymm16, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm3[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm20, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm17, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm25 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm25, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm10, %ymm16, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm4[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [5,6,5,6,5,6,7,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm15[4],ymm1[4],ymm15[5],ymm1[5],ymm15[6],ymm1[6],ymm15[7],ymm1[7],ymm15[12],ymm1[12],ymm15[13],ymm1[13],ymm15[14],ymm1[14],ymm15[15],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm24, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm24, %ymm16 ; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm15 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[8],ymm1[8],ymm15[9],ymm1[9],ymm15[10],ymm1[10],ymm15[11],ymm1[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm1[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0],ymm3[0],ymm13[1],ymm3[1],ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[8],ymm3[8],ymm13[9],ymm3[9],ymm13[10],ymm3[10],ymm13[11],ymm3[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm11[4],ymm0[5],ymm11[5],ymm0[6],ymm11[6],ymm0[7],ymm11[7],ymm0[12],ymm11[12],ymm0[13],ymm11[13],ymm0[14],ymm11[14],ymm0[15],ymm11[15] +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm2[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm11, %zmm15 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [8,21,10,11,20,13,14,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm16, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm31 = [12,1,2,13,4,5,14,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm31, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm3[4],ymm12[5],ymm3[5],ymm12[6],ymm3[6],ymm12[7],ymm3[7],ymm12[12],ymm3[12],ymm12[13],ymm3[13],ymm12[14],ymm3[14],ymm12[15],ymm3[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm24, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [8,21,10,11,20,13,14,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm15, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [12,1,2,13,4,5,14,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm26, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm13[4],ymm0[4],ymm13[5],ymm0[5],ymm13[6],ymm0[6],ymm13[7],ymm0[7],ymm13[12],ymm0[12],ymm13[13],ymm0[13],ymm13[14],ymm0[14],ymm13[15],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm24, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm13[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm13 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm12 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[1],ymm13[1],ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[8],ymm13[8],ymm1[9],ymm13[9],ymm1[10],ymm13[10],ymm1[11],ymm13[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm10[4],ymm5[5],ymm10[5],ymm5[6],ymm10[6],ymm5[7],ymm10[7],ymm5[12],ymm10[12],ymm5[13],ymm10[13],ymm5[14],ymm10[14],ymm5[15],ymm10[15] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm12[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm12 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm13 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[2],ymm13[2],ymm2[3],ymm13[3],ymm2[8],ymm13[8],ymm2[9],ymm13[9],ymm2[10],ymm13[10],ymm2[11],ymm13[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm9[4],ymm5[5],ymm9[5],ymm5[6],ymm9[6],ymm5[7],ymm9[7],ymm5[12],ymm9[12],ymm5[13],ymm9[13],ymm5[14],ymm9[14],ymm5[15],ymm9[15] ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm10[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm9[2,2,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm0, %zmm5, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm0, %zmm5, %zmm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm1[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm31, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm1[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm1[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm26, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm1[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [1,0,2,2,1,0,2,2] -; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm6[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-ONLY-FAST-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [16,9,10,17,12,13,18,15] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm2, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm14[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm14 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm10, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [16,9,10,17,12,13,18,15] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm6, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm8, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm14[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm7, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm27[0],zero,xmm27[1],zero,xmm27[2],zero,xmm27[3],zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm14[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm27[0],zero,xmm27[1],zero,xmm27[2],zero,xmm27[3],zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm29, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm10, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm23[0,1,2,3],zmm21[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm3, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm18, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm9, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm19, %zmm18 ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm25[0,1,2,3],zmm22[0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm18, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm19[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm26, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm15, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm16[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm31, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm9[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm7, %ymm3, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm10[0,1,2,3],zmm2[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm19, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm16[0,1,2,3],zmm17[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm28, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm11, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm15[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,3,4,9,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm5, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm12[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm6, %ymm5, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm8[0,1,2,3],zmm3[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm10, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 320(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: popq %rax ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq @@ -3780,7 +3776,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm24 @@ -3799,16 +3795,16 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm28 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm27 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm11[0],ymm8[0],ymm11[1],ymm8[1],ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[8],ymm8[8],ymm11[9],ymm8[9],ymm11[10],ymm8[10],ymm11[11],ymm8[11] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm12[0],ymm5[1],ymm12[1],ymm5[2],ymm12[2],ymm5[3],ymm12[3],ymm5[8],ymm12[8],ymm5[9],ymm12[9],ymm5[10],ymm12[10],ymm5[11],ymm12[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512DQ-SLOW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1} ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm15 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 @@ -3818,51 +3814,51 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm0[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm26 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm12 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm30 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm0[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm6, %xmm30 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,2,3,3] ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm4 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[1,0,2,2,5,4,6,6] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[1,0,2,2,5,4,6,6] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm0[1,1,1,1] ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm16, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm16, %zmm6 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm9 +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm9 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm10 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm13 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5,6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm7[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3],ymm13[4],ymm6[5,6],ymm13[7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm6[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[8],ymm7[8],ymm9[9],ymm7[9],ymm9[10],ymm7[10],ymm9[11],ymm7[11] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm9 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero @@ -3870,161 +3866,161 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm13[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm13[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm4 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm9, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm3 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5,6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm9[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm9[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm2[0,1,2,3],zmm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm2[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm11[4],ymm8[4],ymm11[5],ymm8[5],ymm11[6],ymm8[6],ymm11[7],ymm8[7],ymm11[12],ymm8[12],ymm11[13],ymm8[13],ymm11[14],ymm8[14],ymm11[15],ymm8[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm12[4],ymm5[5],ymm12[5],ymm5[6],ymm12[6],ymm5[7],ymm12[7],ymm5[12],ymm12[12],ymm5[13],ymm12[13],ymm5[14],ymm12[14],ymm5[15],ymm12[15] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm9 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm4 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm8 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm14[4],ymm9[4],ymm14[5],ymm9[5],ymm14[6],ymm9[6],ymm14[7],ymm9[7],ymm14[12],ymm9[12],ymm14[13],ymm9[13],ymm14[14],ymm9[14],ymm14[15],ymm9[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm7[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm4, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm6 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm2[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm5[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm6[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm14[4],ymm9[4],ymm14[5],ymm9[5],ymm14[6],ymm9[6],ymm14[7],ymm9[7],ymm14[12],ymm9[12],ymm14[13],ymm9[13],ymm14[14],ymm9[14],ymm14[15],ymm9[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm4, %zmm5, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm15, %ymm3 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm1[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm30, %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,0,2,2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm4[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm6 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 ; AVX512DQ-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm4, %zmm3, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm4, %zmm3, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[2,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm28[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm27[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm28[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm27[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm8 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,0,2,2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm6[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm6 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm6, %zmm5, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm7 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,0,2,2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm6, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm26, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,0,2,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm8 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm10 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm9, %zmm10 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm22, %zmm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm16, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm11 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm11 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm5 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm9, %zmm5 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -4044,209 +4040,212 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm22 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [5,6,5,6,5,6,7,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm29 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm4 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [5,6,5,6,5,6,7,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX512DQ-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm3, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm2, %zmm22 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [8,21,10,11,20,13,14,23] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm21, %zmm20 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm31 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm21, %zmm20 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [12,1,2,13,4,5,14,7] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm9, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm4 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm17 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm4[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm2, %ymm9, %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm6 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm3 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm31 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm15[4],ymm5[4],ymm15[5],ymm5[5],ymm15[6],ymm5[6],ymm15[7],ymm5[7],ymm15[12],ymm5[12],ymm15[13],ymm5[13],ymm15[14],ymm5[14],ymm15[15],ymm5[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm16 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[3,3,3,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm23 ; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm13 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm13[4],ymm4[5],ymm13[5],ymm4[6],ymm13[6],ymm4[7],ymm13[7],ymm4[12],ymm13[12],ymm4[13],ymm13[13],ymm4[14],ymm13[14],ymm4[15],ymm13[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm10 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm2[0],ymm10[1],ymm2[1],ymm10[2],ymm2[2],ymm10[3],ymm2[3],ymm10[8],ymm2[8],ymm10[9],ymm2[9],ymm10[10],ymm2[10],ymm10[11],ymm2[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm2, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm10 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm12[0],ymm5[0],ymm12[1],ymm5[1],ymm12[2],ymm5[2],ymm12[3],ymm5[3],ymm12[8],ymm5[8],ymm12[9],ymm5[9],ymm12[10],ymm5[10],ymm12[11],ymm5[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm5, %zmm23 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm12 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm0 ; AVX512DQ-FAST-NEXT: vpermi2d %zmm0, %zmm23, %zmm21 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm9, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm0[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm13[0],ymm4[1],ymm13[1],ymm4[2],ymm13[2],ymm4[3],ymm13[3],ymm4[8],ymm13[8],ymm4[9],ymm13[9],ymm4[10],ymm13[10],ymm4[11],ymm13[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm24 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm4 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm0[0,0,2,1] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [1,0,2,2,1,0,2,2] -; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm26, %ymm5 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm7 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm13[0],ymm4[1],ymm13[1],ymm4[2],ymm13[2],ymm4[3],ymm13[3],ymm4[8],ymm13[8],ymm4[9],ymm13[9],ymm4[10],ymm13[10],ymm4[11],ymm13[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm27 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,0,2,1] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm13 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [1,0,2,2,1,0,2,2] +; AVX512DQ-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm7, %ymm25, %ymm7 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm11 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm25, %zmm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 ; AVX512DQ-FAST-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm5, %zmm7 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [16,9,10,17,12,13,18,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm6 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm26, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm14, %ymm28 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm26, %zmm1 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm8, %zmm1, %zmm27 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm15[0],ymm5[0],ymm15[1],ymm5[1],ymm15[2],ymm5[2],ymm15[3],ymm5[3],ymm15[8],ymm5[8],ymm15[9],ymm5[9],ymm15[10],ymm5[10],ymm15[11],ymm5[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> -; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm8, %zmm15 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm0, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,20,11,12,21,14,15] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm14 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[2],mem[2],ymm14[3],mem[3],ymm14[8],mem[8],ymm14[9],mem[9],ymm14[10],mem[10],ymm14[11],mem[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm7, %zmm4 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm3, %ymm25, %ymm28 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm14, %ymm25 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm6, %zmm28, %zmm0 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm9 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm6, %zmm0, %zmm26 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[8],ymm10[8],ymm8[9],ymm10[9],ymm8[10],ymm10[10],ymm8[11],ymm10[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm10, %zmm8 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm7, %zmm6, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,20,11,12,21,14,15] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm6, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm11 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[2],ymm7[2],ymm11[3],ymm7[3],ymm11[8],ymm7[8],ymm11[9],ymm7[9],ymm11[10],ymm7[10],ymm11[11],ymm7[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,9,2,3,8,5,6,11] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm14, %ymm24, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm14 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm14[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512DQ-FAST-NEXT: vpermd %zmm2, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm13, %ymm29 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm11 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [0,9,2,3,8,5,6,11] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm13, %ymm27, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm13 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm13[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero +; AVX512DQ-FAST-NEXT: vpermd %zmm7, %zmm10, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm10 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm15, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm11 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm14 = ymm13[0],mem[0],ymm13[1],mem[1],ymm13[2],mem[2],ymm13[3],mem[3],ymm13[8],mem[8],ymm13[9],mem[9],ymm13[10],mem[10],ymm13[11],mem[11] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm14 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm10 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm13 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm14, %zmm12, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm12 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm12, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm4, %ymm24, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm4 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm5 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm22[0,1,2,3],zmm20[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm13, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm14, %zmm13 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm23[0,1,2,3],zmm21[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm16 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm14, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,3,4,9,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm12, %ymm6, %ymm7 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm6, %ymm1 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm7[0,1,2,3],zmm25[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm28, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm6 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm27[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm29, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm7, %zmm3 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm15[0,1,2,3],zmm10[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm7 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm14, %zmm2, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm2, %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm5, %ymm27, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm22[0,1,2,3],zmm20[0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm16, %zmm14 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm23[0,1,2,3],zmm21[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm17 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm16, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm2, %ymm5, %ymm4 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm9, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm24[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm25, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm26[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm12[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm8, %zmm7 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm5 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm6[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm1 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) ; AVX512DQ-FAST-NEXT: addq $40, %rsp ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -4358,487 +4357,498 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride6_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $792, %rsp # imm = 0x318 +; SSE-NEXT: subq $808, %rsp # imm = 0x328 ; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm11 +; SSE-NEXT: movdqa (%rsi), %xmm4 ; SSE-NEXT: movdqa 16(%rsi), %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm11 -; SSE-NEXT: movdqa 16(%rdx), %xmm8 +; SSE-NEXT: movdqa (%rdx), %xmm12 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: movdqa (%rcx), %xmm6 -; SSE-NEXT: movdqa 16(%rcx), %xmm4 +; SSE-NEXT: movdqa 16(%rcx), %xmm3 ; SSE-NEXT: movdqa (%r8), %xmm9 -; SSE-NEXT: movdqa (%r9), %xmm12 -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa (%r9), %xmm8 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm5[3,3] +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm5[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm5[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] ; SSE-NEXT: movaps {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[0,2,2,3,4,5,6,7] +; SSE-NEXT: andps %xmm14, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm8, %xmm0 +; SSE-NEXT: orps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm5 -; SSE-NEXT: andnps %xmm7, %xmm5 -; SSE-NEXT: orps %xmm0, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm12[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] +; SSE-NEXT: andps %xmm14, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm6, %xmm0 +; SSE-NEXT: orps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm11[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm0[3,3] +; SSE-NEXT: movdqa 16(%r8), %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm6[0,1] +; SSE-NEXT: movdqa 16(%r9), %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] +; SSE-NEXT: andps %xmm14, %xmm4 +; SSE-NEXT: orps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm5 -; SSE-NEXT: andnps %xmm3, %xmm5 -; SSE-NEXT: orps %xmm0, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdx), %xmm2 +; SSE-NEXT: movdqa 32(%rcx), %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[3,3] -; SSE-NEXT: movdqa 16(%r8), %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm6[0,1] -; SSE-NEXT: movdqa 16(%r9), %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm7 -; SSE-NEXT: andnps %xmm6, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm8[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdx), %xmm0 -; SSE-NEXT: movdqa 32(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm3 ; SSE-NEXT: movdqa 32(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[3,3] -; SSE-NEXT: movdqa 32(%r8), %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[3,3] +; SSE-NEXT: movdqa 32(%r8), %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm8[0,1] -; SSE-NEXT: movdqa 32(%r9), %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%r9), %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: andnps %xmm8, %xmm10 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm8, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm7 -; SSE-NEXT: orps %xmm7, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: orps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,5,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm6[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm7 -; SSE-NEXT: andnps %xmm6, %xmm7 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm6, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdx), %xmm2 ; SSE-NEXT: movdqa 48(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa 48(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm3[3,3] +; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[3,3] ; SSE-NEXT: movdqa 48(%r8), %xmm6 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm6[2,1,3,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm11[0,1] -; SSE-NEXT: movdqa 48(%r9), %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%r9), %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: andnps %xmm11, %xmm10 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm11, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm8 -; SSE-NEXT: orps %xmm8, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: orps %xmm8, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,6,5,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm7[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm8 -; SSE-NEXT: andnps %xmm7, %xmm8 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm7, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm8 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdx), %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdx), %xmm2 ; SSE-NEXT: movdqa 64(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm3 ; SSE-NEXT: movdqa 64(%rsi), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm11 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm0[3,3] ; SSE-NEXT: movdqa 64(%r8), %xmm7 ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[2,1,3,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm12[0,1] -; SSE-NEXT: movdqa 64(%r9), %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%r9), %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: andnps %xmm12, %xmm10 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm12, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm11 -; SSE-NEXT: orps %xmm11, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: orps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,5,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm8[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: andnps %xmm8, %xmm10 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm8, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdx), %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdx), %xmm2 ; SSE-NEXT: movdqa 80(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm3 ; SSE-NEXT: movdqa 80(%rsi), %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: movdqa %xmm3, %xmm12 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm3[3,3] -; SSE-NEXT: movdqa 80(%r8), %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm3[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm0[3,3] +; SSE-NEXT: movdqa 80(%r8), %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm8[2,1,3,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,2],xmm15[0,1] -; SSE-NEXT: movdqa 80(%r9), %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%r9), %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: andnps %xmm15, %xmm10 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm15, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm12 -; SSE-NEXT: orps %xmm12, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: orps %xmm12, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,6,5,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm11[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: andnps %xmm11, %xmm10 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm11, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdx), %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdx), %xmm2 ; SSE-NEXT: movdqa 96(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm3 ; SSE-NEXT: movdqa 96(%rsi), %xmm12 -; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm3, %xmm15 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm0[3,3] ; SSE-NEXT: movdqa 96(%r8), %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm11[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm13[0,1] -; SSE-NEXT: movdqa 96(%r9), %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: andnps %xmm13, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm0[0,1] +; SSE-NEXT: movdqa 96(%r9), %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: movaps %xmm14, %xmm13 +; SSE-NEXT: andnps %xmm0, %xmm13 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm15 -; SSE-NEXT: orps %xmm15, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: orps %xmm15, %xmm13 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm12[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm8[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: andnps %xmm12, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdx), %xmm0 -; SSE-NEXT: movdqa 112(%rcx), %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm1 -; SSE-NEXT: movdqa 112(%rsi), %xmm13 -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: movaps %xmm14, %xmm12 +; SSE-NEXT: andnps %xmm1, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm12 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdx), %xmm4 +; SSE-NEXT: movdqa 112(%rcx), %xmm5 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm0 +; SSE-NEXT: movdqa 112(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm8[3,3] -; SSE-NEXT: movdqa 112(%r8), %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm12[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm10[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm2[3,3] +; SSE-NEXT: movdqa 112(%r8), %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm3[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm13[0,1] ; SSE-NEXT: movdqa 112(%r9), %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] ; SSE-NEXT: movaps %xmm14, %xmm12 -; SSE-NEXT: andnps %xmm10, %xmm12 +; SSE-NEXT: andnps %xmm13, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm15 ; SSE-NEXT: orps %xmm15, %xmm12 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm4[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] ; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] -; SSE-NEXT: andnps %xmm10, %xmm14 -; SSE-NEXT: orps %xmm1, %xmm14 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: andnps %xmm1, %xmm14 +; SSE-NEXT: orps %xmm0, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm1[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm13 ; SSE-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: andps %xmm8, %xmm10 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm2[0,2] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: andps %xmm12, %xmm10 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,0,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,1],xmm0[1,3] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm13[0,2] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: andps %xmm8, %xmm10 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,1],xmm1[1,3] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[0,2] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm2[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm3[0,2] +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: andps %xmm12, %xmm9 +; SSE-NEXT: andps %xmm10, %xmm9 ; SSE-NEXT: por %xmm9, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[1,3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm10[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: andps %xmm8, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm9[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: andps %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm2 -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,1],xmm0[1,3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm9[0,2] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: andps %xmm8, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm10[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: andps %xmm12, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[1,3] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: andps %xmm8, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm1[1,3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm9[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: andps %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm9[1] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: andps %xmm8, %xmm2 -; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm12, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm5[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: andps %xmm12, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] @@ -4847,9 +4857,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] @@ -4857,11 +4867,11 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: andps %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -4871,18 +4881,18 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: andps %xmm12, %xmm6 +; SSE-NEXT: andps %xmm10, %xmm6 ; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -4896,9 +4906,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] @@ -4906,9 +4916,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: andps %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -4920,71 +4930,69 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: andps %xmm12, %xmm7 +; SSE-NEXT: andps %xmm10, %xmm7 ; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3] -; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] +; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: andps %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 ; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm4[1,3] +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm3[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: andps %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[0,2] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: andps %xmm10, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4996,9 +5004,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm13 +; SSE-NEXT: movdqa %xmm12, %xmm13 ; SSE-NEXT: pandn %xmm2, %xmm13 -; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm13 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,1,1,1,4,5,6,7] @@ -5006,10 +5014,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm9 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -5021,72 +5029,72 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm6 ; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] ; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm7[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm3[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: andps %xmm12, %xmm11 +; SSE-NEXT: andps %xmm10, %xmm11 ; SSE-NEXT: por %xmm11, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[1,3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1,3] +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[0,2] ; SSE-NEXT: movdqa %xmm15, %xmm5 ; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: movdqa %xmm12, %xmm11 ; SSE-NEXT: pandn %xmm5, %xmm11 -; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[1,1,1,1,4,5,6,7] -; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movaps %xmm1, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,0,1,1] -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: andps %xmm12, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: andps %xmm10, %xmm5 ; SSE-NEXT: por %xmm5, %xmm15 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[0,2] -; SSE-NEXT: andps %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm9 -; SSE-NEXT: pandn %xmm9, %xmm8 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2] +; SSE-NEXT: andps %xmm12, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm8 +; SSE-NEXT: pandn %xmm8, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[0,2] -; SSE-NEXT: andps %xmm12, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm12 -; SSE-NEXT: por %xmm7, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm2[0,2] +; SSE-NEXT: andps %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: por %xmm7, %xmm10 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm12, 736(%rax) -; SSE-NEXT: movdqa %xmm8, 720(%rax) +; SSE-NEXT: movdqa %xmm10, 736(%rax) +; SSE-NEXT: movdqa %xmm12, 720(%rax) ; SSE-NEXT: movdqa %xmm15, 688(%rax) ; SSE-NEXT: movdqa %xmm11, 672(%rax) ; SSE-NEXT: movdqa %xmm4, 640(%rax) ; SSE-NEXT: movdqa %xmm6, 624(%rax) -; SSE-NEXT: movdqa %xmm10, 592(%rax) +; SSE-NEXT: movdqa %xmm9, 592(%rax) ; SSE-NEXT: movdqa %xmm13, 576(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 544(%rax) @@ -5108,7 +5116,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 336(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 288(%rax) @@ -5120,7 +5128,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) @@ -5167,7 +5175,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: addq $792, %rsp # imm = 0x318 +; SSE-NEXT: addq $808, %rsp # imm = 0x328 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride6_vf64: @@ -5606,13 +5614,13 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -5647,22 +5655,22 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[1,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm6[0,1],xmm1[0],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm7[0,1],xmm1[0],xmm7[3] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm9[3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] @@ -5679,7 +5687,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 16(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 16(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 112(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm10, 96(%rax) @@ -5771,260 +5779,262 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-LABEL: store_i16_stride6_vf64: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $1544, %rsp # imm = 0x608 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm11 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm14 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm8 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm14, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm10, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] @@ -6032,22 +6042,22 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero +; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -6056,10 +6066,11 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] @@ -6110,7 +6121,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm7[0],ymm15[1],ymm7[1],ymm15[2],ymm7[2],ymm15[3],ymm7[3],ymm15[8],ymm7[8],ymm15[9],ymm7[9],ymm15[10],ymm7[10],ymm15[11],ymm7[11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -6127,206 +6138,209 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm13[0],mem[0],ymm13[1],mem[1],ymm13[2],mem[2],ymm13[3],mem[3],ymm13[8],mem[8],ymm13[9],mem[9],ymm13[10],mem[10],ymm13[11],mem[11] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[8],ymm6[8],ymm10[9],ymm6[9],ymm10[10],ymm6[10],ymm10[11],ymm6[11] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = mem[2,3,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm14 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[2,3,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm9 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[12],mem[12],ymm1[13],mem[13],ymm1[14],mem[14],ymm1[15],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm15 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm15, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm15 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3],ymm15[4],ymm0[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm13[4],mem[4],ymm13[5],mem[5],ymm13[6],mem[6],ymm13[7],mem[7],ymm13[12],mem[12],ymm13[13],mem[13],ymm13[14],mem[14],ymm13[15],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm12, %ymm15 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3],ymm15[4],ymm0[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm15, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm10[4],mem[4],ymm10[5],mem[5],ymm10[6],mem[6],ymm10[7],mem[7],ymm10[12],mem[12],ymm10[13],mem[13],ymm10[14],mem[14],ymm10[15],mem[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2,3],ymm5[4],ymm12[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm5, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm7[4],mem[4],ymm7[5],mem[5],ymm7[6],mem[6],ymm7[7],mem[7],ymm7[12],mem[12],ymm7[13],mem[13],ymm7[14],mem[14],ymm7[15],mem[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2],ymm11[3,4],ymm1[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 736(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 672(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 736(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 672(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 544(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 480(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 352(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 352(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 704(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 640(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 640(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 576(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 512(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm14, 448(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 448(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6355,23 +6369,24 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i16_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1592, %rsp # imm = 0x638 +; AVX2-FAST-NEXT: subq $1560, %rsp # imm = 0x618 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6381,223 +6396,222 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm4 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm13 +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm12 +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm12 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm11 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm9 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm15 +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm8 ; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm7 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm6 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,0,2,2,1,0,2,2] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpmovzxwd (%rsp), %xmm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] @@ -6607,18 +6621,20 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] ; AVX2-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,0,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[0,0,2,1,4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6631,8 +6647,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero +; AVX2-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -6640,9 +6656,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload @@ -6653,18 +6667,19 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshuflw $96, (%rsp), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm2 = mem[0,0,2,1,4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm12[0],ymm1[1],ymm12[1],ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[8],ymm12[8],ymm1[9],ymm12[9],ymm1[10],ymm12[10],ymm1[11],ymm12[11] +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,4,2,2,5,4,6,6] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm14[0],ymm3[1],ymm14[1],ymm3[2],ymm14[2],ymm3[3],ymm14[3],ymm3[8],ymm14[8],ymm3[9],ymm14[9],ymm3[10],ymm14[10],ymm3[11],ymm14[11] +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] ; AVX2-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload @@ -6692,173 +6707,174 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11] +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm12 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[2],ymm7[2],ymm11[3],ymm7[3],ymm11[8],ymm7[8],ymm11[9],ymm7[9],ymm11[10],ymm7[10],ymm11[11],ymm7[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <1,2,1,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,2,1,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [5,6,5,6,5,6,7,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm14[4],ymm0[5],ymm14[5],ymm0[6],ymm14[6],ymm0[7],ymm14[7],ymm0[12],ymm14[12],ymm0[13],ymm14[13],ymm0[14],ymm14[14],ymm0[15],ymm14[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [5,6,5,6,5,6,7,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm14, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm14, %ymm5 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm6, %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm3, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5,6],ymm14[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm14, %ymm4 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm6, %ymm14 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm0[4],ymm13[4],ymm0[5],ymm13[5],ymm0[6],ymm13[6],ymm0[7],ymm13[7],ymm0[12],ymm13[12],ymm0[13],ymm13[13],ymm0[14],ymm13[14],ymm0[15],ymm13[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm3, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm14 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5,6],ymm14[7] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm11[4],mem[4],ymm11[5],mem[5],ymm11[6],mem[6],ymm11[7],mem[7],ymm11[12],mem[12],ymm11[13],mem[13],ymm11[14],mem[14],ymm11[15],mem[15] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm11[4],mem[4],ymm11[5],mem[5],ymm11[6],mem[6],ymm11[7],mem[7],ymm11[12],mem[12],ymm11[13],mem[13],ymm11[14],mem[14],ymm11[15],mem[15] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[3,3,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm1, 736(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -6866,15 +6882,15 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa %ymm0, 544(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 480(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 352(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 352(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 160(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 704(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 640(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 640(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 576(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6903,71 +6919,73 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $1592, %rsp # imm = 0x638 +; AVX2-FAST-NEXT: addq $1560, %rsp # imm = 0x618 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride6_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1464, %rsp # imm = 0x5B8 +; AVX2-FAST-PERLANE-NEXT: subq $1544, %rsp # imm = 0x608 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -6975,7 +6993,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -6983,18 +7001,19 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] @@ -7007,18 +7026,19 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[8],ymm10[8],ymm0[9],ymm10[9],ymm0[10],ymm10[10],ymm0[11],ymm10[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm9[0],ymm13[1],ymm9[1],ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[8],ymm9[8],ymm13[9],ymm9[9],ymm13[10],ymm9[10],ymm13[11],ymm9[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] @@ -7031,7 +7051,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7040,10 +7060,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] @@ -7056,7 +7076,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7064,11 +7084,11 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] @@ -7081,19 +7101,19 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[8],ymm7[8],ymm0[9],ymm7[9],ymm0[10],ymm7[10],ymm0[11],ymm7[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] @@ -7106,355 +7126,354 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm4 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm7 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm12 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, (%rsp), %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm1 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm8 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm11, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[2],ymm1[2],ymm7[3],ymm1[3],ymm7[8],ymm1[8],ymm7[9],ymm1[9],ymm7[10],ymm1[10],ymm7[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm7 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm8 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1,2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm8 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm12 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm12[0],ymm8[0],ymm12[1],ymm8[1],ymm12[2],ymm8[2],ymm12[3],ymm8[3],ymm12[8],ymm8[8],ymm12[9],ymm8[9],ymm12[10],ymm8[10],ymm12[11],ymm8[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm8 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm5 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm12 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm12[0],ymm8[0],ymm12[1],ymm8[1],ymm12[2],ymm8[2],ymm12[3],ymm8[3],ymm12[8],ymm8[8],ymm12[9],ymm8[9],ymm12[10],ymm8[10],ymm12[11],ymm8[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm8 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm5 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm9, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm14, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm10[4],ymm0[5],ymm10[5],ymm0[6],ymm10[6],ymm0[7],ymm10[7],ymm0[12],ymm10[12],ymm0[13],ymm10[13],ymm0[14],ymm10[14],ymm0[15],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm6[4],mem[4],ymm6[5],mem[5],ymm6[6],mem[6],ymm6[7],mem[7],ymm6[12],mem[12],ymm6[13],mem[13],ymm6[14],mem[14],ymm6[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm9[4],mem[4],ymm9[5],mem[5],ymm9[6],mem[6],ymm9[7],mem[7],ymm9[12],mem[12],ymm9[13],mem[13],ymm9[14],mem[14],ymm9[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm13[4],mem[4],ymm13[5],mem[5],ymm13[6],mem[6],ymm13[7],mem[7],ymm13[12],mem[12],ymm13[13],mem[13],ymm13[14],mem[14],ymm13[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm14, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm9[4],mem[4],ymm9[5],mem[5],ymm9[6],mem[6],ymm9[7],mem[7],ymm9[12],mem[12],ymm9[13],mem[13],ymm9[14],mem[14],ymm9[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm9[4],mem[4],ymm9[5],mem[5],ymm9[6],mem[6],ymm9[7],mem[7],ymm9[12],mem[12],ymm9[13],mem[13],ymm9[14],mem[14],ymm9[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm12[4],ymm1[5],ymm12[5],ymm1[6],ymm12[6],ymm1[7],ymm12[7],ymm1[12],ymm12[12],ymm1[13],ymm12[13],ymm1[14],ymm12[14],ymm1[15],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm10[4],mem[4],ymm10[5],mem[5],ymm10[6],mem[6],ymm10[7],mem[7],ymm10[12],mem[12],ymm10[13],mem[13],ymm10[14],mem[14],ymm10[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm7[4],mem[4],ymm7[5],mem[5],ymm7[6],mem[6],ymm7[7],mem[7],ymm7[12],mem[12],ymm7[13],mem[13],ymm7[14],mem[14],ymm7[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm8, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm5[1],ymm11[2,3],ymm5[4],ymm11[5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2],ymm11[3,4],ymm1[5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm8, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm5, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 736(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 704(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 672(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 736(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 704(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 672(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 544(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 512(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 480(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 640(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 640(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 608(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 576(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 448(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 448(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7471,21 +7490,21 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $1464, %rsp # imm = 0x5B8 +; AVX2-FAST-PERLANE-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i16_stride6_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $408, %rsp # imm = 0x198 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: subq $392, %rsp # imm = 0x188 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] @@ -7499,10 +7518,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 @@ -7518,961 +7537,958 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm3, %xmm13 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] ; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %xmm19 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm19[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm0[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm0[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,0,2,2,5,4,6,6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,0,2,2,5,4,6,6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm6[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1],ymm5[2],ymm11[3,4],ymm5[5],ymm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm13, %xmm16 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm5[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm10, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm5[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm10[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm10 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm12 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[8],ymm10[8],ymm12[9],ymm10[9],ymm12[10],ymm10[10],ymm12[11],ymm10[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm12[0],ymm8[0],ymm12[1],ymm8[1],ymm12[2],ymm8[2],ymm12[3],ymm8[3],ymm12[8],ymm8[8],ymm12[9],ymm8[9],ymm12[10],ymm8[10],ymm12[11],ymm8[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm7[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm10, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm6[0,1,2,3],zmm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm11, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm6[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm11[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm11 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm12 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[8],ymm10[8],ymm12[9],ymm10[9],ymm12[10],ymm10[10],ymm12[11],ymm10[11] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm4, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm3, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm5[1,2],ymm12[3],ymm5[4,5],ymm12[6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm5[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm10[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,0,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm6[1,2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm11[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,0,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm11[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm4 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm4[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm11, %xmm16 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm3, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,0,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm9 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm10 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm9, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1,2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm13, %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm0, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,0,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm8 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm7, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm15[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm10 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm10, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm10[0,1,2,3],zmm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,2,2,5,4,6,6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm11 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm14[1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm14, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm11[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[1,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[1,0,2,2,5,4,6,6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm11[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm14[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[1,0,2,2,5,4,6,6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm14[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm14, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm13 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm11[0,1,2,3],zmm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm11 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm13 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm13[0,1,2,3],zmm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm13 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm14 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm8[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm1[0],ymm14[1],ymm1[1],ymm14[2],ymm1[2],ymm14[3],ymm1[3],ymm14[8],ymm1[8],ymm14[9],ymm1[9],ymm14[10],ymm1[10],ymm14[11],ymm1[11] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm9, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm10, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6],ymm0[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm8[4],ymm2[5],ymm8[5],ymm2[6],ymm8[6],ymm2[7],ymm8[7],ymm2[12],ymm8[12],ymm2[13],ymm8[13],ymm2[14],ymm8[14],ymm2[15],ymm8[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm11[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm13[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm11[4],ymm2[5],ymm11[5],ymm2[6],ymm11[6],ymm2[7],ymm11[7],ymm2[12],ymm11[12],ymm2[13],ymm11[13],ymm2[14],ymm11[14],ymm2[15],ymm11[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm10[4],ymm13[4],ymm10[5],ymm13[5],ymm10[6],ymm13[6],ymm10[7],ymm13[7],ymm10[12],ymm13[12],ymm10[13],ymm13[13],ymm10[14],ymm13[14],ymm10[15],ymm13[15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[3,3,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm25, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm29, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm8, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm17, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm12[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm12[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm4, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm4, %ymm25 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm12[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3],ymm5[4,5],ymm10[6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm0[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm14[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm6 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm12 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm15 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,0,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm6[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm14 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,0,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm5[1,2],ymm12[3],ymm5[4,5],ymm12[6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm23 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0],ymm6[1,2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm23 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm23, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm25, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm10, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm25, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm15 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm22, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm18, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm14, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm18, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm18, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm8, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm16, %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm27, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm16 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm14, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm24, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm26, %zmm14, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm24, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm29, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm14, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm14, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm14, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $408, %rsp # imm = 0x198 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $392, %rsp # imm = 0x188 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride6_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1064, %rsp # imm = 0x428 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: subq $1272, %rsp # imm = 0x4F8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm11[0],ymm2[1],ymm11[1],ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[8],ymm11[8],ymm2[9],ymm11[9],ymm2[10],ymm11[10],ymm2[11],ymm11[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm6 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm14[0],ymm6[1],ymm14[1],ymm6[2],ymm14[2],ymm6[3],ymm14[3],ymm6[8],ymm14[8],ymm6[9],ymm14[9],ymm6[10],ymm14[10],ymm6[11],ymm14[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm8, %xmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[8],ymm3[8],ymm9[9],ymm3[9],ymm9[10],ymm3[10],ymm9[11],ymm3[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm8, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[2],ymm0[2],ymm7[3],ymm0[3],ymm7[8],ymm0[8],ymm7[9],ymm0[9],ymm7[10],ymm0[10],ymm7[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,1,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm22, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm19, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm4 ; AVX512F-ONLY-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm18 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm16 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,9,20,11,12,21,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm17, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,9,2,3,8,5,6,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [0,9,2,3,8,5,6,11] ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm8, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm27, %ymm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,2,2,2] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm22, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm16 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm19, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm20 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm17, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm12, %ymm8, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm27, %ymm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [5,6,5,6,5,6,7,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm11[4],ymm2[5],ymm11[5],ymm2[6],ymm11[6],ymm2[7],ymm11[7],ymm2[12],ymm11[12],ymm2[13],ymm11[13],ymm2[14],ymm11[14],ymm2[15],ymm11[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm24, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm11 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[1],ymm11[1],ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[8],ymm11[8],ymm2[9],ymm11[9],ymm2[10],ymm11[10],ymm2[11],ymm11[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm2[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[8],ymm0[8],ymm10[9],ymm0[9],ymm10[10],ymm0[10],ymm10[11],ymm0[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm11, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm9, %zmm25 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [8,21,10,11,20,13,14,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,1,2,13,4,5,14,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm2, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [5,6,5,6,5,6,7,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm14[4],ymm6[5],ymm14[5],ymm6[6],ymm14[6],ymm6[7],ymm14[7],ymm6[12],ymm14[12],ymm6[13],ymm14[13],ymm6[14],ymm14[14],ymm6[15],ymm14[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[8],ymm14[8],ymm12[9],ymm14[9],ymm12[10],ymm14[10],ymm12[11],ymm14[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm10[4],ymm8[4],ymm10[5],ymm8[5],ymm10[6],ymm8[6],ymm10[7],ymm8[7],ymm10[12],ymm8[12],ymm10[13],ymm8[13],ymm10[14],ymm8[14],ymm10[15],ymm8[15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm24 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [8,21,10,11,20,13,14,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm6, %zmm29, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [12,1,2,13,4,5,14,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm6, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm14[4],ymm6[5],ymm14[5],ymm6[6],ymm14[6],ymm6[7],ymm14[7],ymm6[12],ymm14[12],ymm6[13],ymm14[13],ymm6[14],ymm14[14],ymm6[15],ymm14[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm24, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm15[4],ymm11[4],ymm15[5],ymm11[5],ymm15[6],ymm11[6],ymm15[7],ymm11[7],ymm15[12],ymm11[12],ymm15[13],ymm11[13],ymm15[14],ymm11[14],ymm15[15],ymm11[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm3[4],ymm9[5],ymm3[5],ymm9[6],ymm3[6],ymm9[7],ymm3[7],ymm9[12],ymm3[12],ymm9[13],ymm3[13],ymm9[14],ymm3[14],ymm9[15],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm25, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm0 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm20, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm3, %ymm10, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [1,0,2,2,1,0,2,2] -; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm19, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm28 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm29, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm3, %ymm12, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm31 = [1,0,2,2,1,0,2,2] +; AVX512F-ONLY-FAST-NEXT: # ymm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm26, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm31, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm8 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm30 ; AVX512F-ONLY-FAST-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm30 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [16,9,10,17,12,13,18,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm23, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm19, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm31, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm31 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm23, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm15[0],ymm11[0],ymm15[1],ymm11[1],ymm15[2],ymm11[2],ymm15[3],ymm11[3],ymm15[8],ymm11[8],ymm15[9],ymm11[9],ymm15[10],ymm11[10],ymm15[11],ymm11[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm27, %zmm22, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm27 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm8, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[1,1,1,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm11[0],ymm15[1],ymm11[1],ymm15[2],ymm11[2],ymm15[3],ymm11[3],ymm15[8],ymm11[8],ymm15[9],ymm11[9],ymm15[10],ymm11[10],ymm15[11],ymm11[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [16,9,10,17,12,13,18,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm26, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm31, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm13, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm23 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm26, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm3 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm27, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm0[1,1,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm13[0],ymm8[1],ymm13[1],ymm8[2],ymm13[2],ymm8[3],ymm13[3],ymm8[8],ymm13[8],ymm8[9],ymm13[9],ymm8[10],ymm13[10],ymm8[11],ymm13[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm0[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm21 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm11, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm11, %zmm21 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm1, %zmm21, %zmm17 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm8, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15] -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm8 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm13[0],ymm8[0],ymm13[1],ymm8[1],ymm13[2],ymm8[2],ymm13[3],ymm8[3],ymm13[8],ymm8[8],ymm13[9],ymm8[9],ymm13[10],ymm8[10],ymm13[11],ymm8[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm15[4],ymm11[4],ymm15[5],ymm11[5],ymm15[6],ymm11[6],ymm15[7],ymm11[7],ymm15[12],ymm11[12],ymm15[13],ymm11[13],ymm15[14],ymm11[14],ymm15[15],ymm11[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm24, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm1 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm20, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm27, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm0[4],ymm7[5],ymm0[5],ymm7[6],ymm0[6],ymm7[7],ymm0[7],ymm7[12],ymm0[12],ymm7[13],ymm0[13],ymm7[14],ymm0[14],ymm7[15],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm7 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[2],ymm0[2],ymm7[3],ymm0[3],ymm7[8],ymm0[8],ymm7[9],ymm0[9],ymm7[10],ymm0[10],ymm7[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm14[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm8[4],ymm13[4],ymm8[5],ymm13[5],ymm8[6],ymm13[6],ymm8[7],ymm13[7],ymm8[12],ymm13[12],ymm8[13],ymm13[13],ymm8[14],ymm13[14],ymm8[15],ymm13[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm13[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm25, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm27 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm13[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm8[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm7[2,2,2,2] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [12,1,2,13,4,5,14,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm6, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [12,1,2,13,4,5,14,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm7, %ymm18, %ymm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm8[4],ymm0[5],ymm8[5],ymm0[6],ymm8[6],ymm0[7],ymm8[7],ymm0[12],ymm8[12],ymm0[13],ymm8[13],ymm0[14],ymm8[14],ymm0[15],ymm8[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm24, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm15[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm15 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm12 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[8],ymm15[8],ymm12[9],ymm15[9],ymm12[10],ymm15[10],ymm12[11],ymm15[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm7, %ymm25, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm13[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm13 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm14 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[8],ymm15[8],ymm13[9],ymm15[9],ymm13[10],ymm15[10],ymm13[11],ymm15[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm8[4],ymm0[4],ymm8[5],ymm0[5],ymm8[6],ymm0[6],ymm8[7],ymm0[7],ymm8[12],ymm0[12],ymm8[13],ymm0[13],ymm8[14],ymm0[14],ymm8[15],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm14[0],ymm4[1],ymm14[1],ymm4[2],ymm14[2],ymm4[3],ymm14[3],ymm4[8],ymm14[8],ymm4[9],ymm14[9],ymm4[10],ymm14[10],ymm4[11],ymm14[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm12, %zmm2 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm2, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm12, %ymm6, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm11, %zmm4 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm1, %zmm4, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm18, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm19, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm12 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm13 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm7, %ymm19, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm12 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm13 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm13 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm13 = zmm18[0,1,2,3],mem[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm13 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm13 = zmm16[0,1,2,3],mem[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm16 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm27[0,1,2,3],zmm21[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm19 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm26, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm18, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm3 = zmm25[0,1,2,3],mem[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm13 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm14[0,1,2,3],zmm29[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm14 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm24[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm22[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm18 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm12, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm31, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm31, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm16[0,1,2,3],mem[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm16 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm20[0,1,2,3],mem[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm20 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm22[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm21[0,1,2,3],zmm17[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm24[0,1,2,3],mem[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm18 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm28[0,1,2,3],mem[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm27[0,1,2,3],zmm6[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm11[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm13[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm31, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm21 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm14, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm12, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm9[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm11, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] ; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # xmm22 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm13, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm26 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm1, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm27 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm20[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,8,3,4,9,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm18, %ymm2, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm22, %ymm2, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm26, %ymm2, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm27, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm2 = zmm30[0,1,2,3],mem[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm21, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm31[0,1,2,3],zmm28[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm0[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm23[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm27 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm0, %zmm6, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm28 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm29[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm31, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,3,4,9,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm21, %ymm4, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm22, %ymm4, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm27, %ymm4, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm28, %ymm4, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm30[0,1,2,3],mem[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm14, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm23[0,1,2,3],mem[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm19[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm6[0,1,2,3],zmm26[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm14, %zmm4 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1064, %rsp # imm = 0x428 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $1272, %rsp # imm = 0x4F8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i16_stride6_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $408, %rsp # imm = 0x198 +; AVX512DQ-SLOW-NEXT: subq $392, %rsp # imm = 0x188 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm1 @@ -8481,7 +8497,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] @@ -8495,10 +8511,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] ; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512DQ-SLOW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 @@ -8519,1053 +8535,1045 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %xmm19 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm19[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm0[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm0[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %xmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm0[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %xmm8 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,2,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[1,0,2,2,5,4,6,6] ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm6, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %ymm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm12, %xmm18 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm13 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm11, %zmm6, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %ymm5 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm13, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm12, %xmm16 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3],ymm11[4],ymm5[5,6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm5[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm10, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm5[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %ymm5 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm10[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5,6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm11[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %xmm11 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm11, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm6[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %ymm6 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm11[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm10[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm10[4],ymm7[4],ymm10[5],ymm7[5],ymm10[6],ymm7[6],ymm10[7],ymm7[7],ymm10[12],ymm7[12],ymm10[13],ymm7[13],ymm10[14],ymm7[14],ymm10[15],ymm7[15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm8 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm11 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm11[0],ymm8[0],ymm11[1],ymm8[1],ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[8],ymm8[8],ymm11[9],ymm8[9],ymm11[10],ymm8[10],ymm11[11],ymm8[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm3 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm8, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm10 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm13 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm13[0],ymm10[0],ymm13[1],ymm10[1],ymm13[2],ymm10[2],ymm13[3],ymm10[3],ymm13[8],ymm10[8],ymm13[9],ymm10[9],ymm13[10],ymm10[10],ymm13[11],ymm10[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm10, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm14 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm15 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm12 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm12[4],ymm8[5],ymm12[5],ymm8[6],ymm12[6],ymm8[7],ymm12[7],ymm8[12],ymm12[12],ymm8[13],ymm12[13],ymm8[14],ymm12[14],ymm8[15],ymm12[15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[12],ymm12[12],ymm10[13],ymm12[13],ymm10[14],ymm12[14],ymm10[15],ymm12[15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[3,3,3,3] ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm7 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm10 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm11 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[12],ymm7[12],ymm4[13],ymm7[13],ymm4[14],ymm7[14],ymm4[15],ymm7[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm11, %zmm10, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm3, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm16 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm5[1,2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm5[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm10[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm11 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm3 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm13 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[12],ymm7[12],ymm3[13],ymm7[13],ymm3[14],ymm7[14],ymm3[15],ymm7[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm13, %zmm11, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm5, %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm31 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5,6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0],ymm6[1,2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm6[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm11[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm11[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm11 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] ; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm9, %ymm9 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm8, %ymm8 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX512DQ-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm9, %zmm2, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm9[1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm13, %xmm17 -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm27 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm8, %zmm1, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm8[1,2],ymm1[3],ymm8[4,5],ymm1[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm14, %xmm17 +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm22 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm14 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm9 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,0,2,2] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm15 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm11 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm14, %zmm11 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %xmm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm11 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm13, %zmm11 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %xmm4 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1,2],ymm0[3],ymm14[4,5],ymm0[6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1,2],ymm0[3],ymm13[4,5],ymm0[6],ymm13[7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm14, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm11[0,1,2,3],zmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm13 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm13, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm11[0,1,2,3],zmm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[8],ymm7[8],ymm4[9],ymm7[9],ymm4[10],ymm7[10],ymm4[11],ymm7[11] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,2,2,5,4,6,6] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[8],ymm7[8],ymm3[9],ymm7[9],ymm3[10],ymm7[10],ymm3[11],ymm7[11] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,2,2,5,4,6,6] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[8],ymm12[8],ymm8[9],ymm12[9],ymm8[10],ymm12[10],ymm8[11],ymm12[11] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[8],ymm12[8],ymm10[9],ymm12[9],ymm10[10],ymm12[10],ymm10[11],ymm12[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm2 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[1,0,2,2,5,4,6,6] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm0[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,2,3,3] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,0,2,2,5,4,6,6] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[1,1,1,1] ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm3, %zmm12 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm15, %zmm14, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm15 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm12, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm13 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm15, %zmm12, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm14 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3],ymm14[4],ymm3[5,6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm0[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm12[0],ymm3[0],ymm12[1],ymm3[1],ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[8],ymm3[8],ymm12[9],ymm3[9],ymm12[10],ymm3[10],ymm12[11],ymm3[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm13 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm12 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[8],ymm1[8],ymm12[9],ymm1[9],ymm12[10],ymm1[10],ymm12[11],ymm1[11] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm8, %zmm1, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm14 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm15, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm9, %zmm1, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm12 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm2, %ymm1 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm9 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm31[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm18[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm8 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm9 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm9, %zmm8, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm10 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm1 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm3, %ymm1 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm8 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm9 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm9 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm11[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm0[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm4, %ymm24 +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm27, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm9 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm24, %zmm12, %zmm10 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm2[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm2[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm3, %ymm25 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm3[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[1,0,2,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm10, %zmm5 {%k2} -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm7 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm7[1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm12 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm15 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm15 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm13 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,0,2,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm12, %zmm13, %zmm7 {%k2} -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,0,2,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm13, %ymm13 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm13, %zmm8, %zmm6 {%k2} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[2,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm8[1,2],ymm13[3],ymm8[4,5],ymm13[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm23 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm23, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm1, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm24 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm24, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm8, %ymm8 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm16, %zmm12, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm1 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm22, %zmm18 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm14, %zmm18 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm12 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm29, %zmm16 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm16 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm16, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm14, %zmm11 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm27, %zmm13 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm22, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm26, %zmm14, %zmm13 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm15 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm29, %zmm14, %zmm15 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm14, %zmm13 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm15 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm14, %zmm15 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm14, %zmm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm0 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm14, %zmm2 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQ-SLOW-NEXT: addq $408, %rsp # imm = 0x198 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-SLOW-NEXT: addq $392, %rsp # imm = 0x188 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i16_stride6_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $920, %rsp # imm = 0x398 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm6 +; AVX512DQ-FAST-NEXT: subq $888, %rsp # imm = 0x378 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm8 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm15, %xmm28 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[2],ymm13[2],ymm2[3],ymm13[3],ymm2[8],ymm13[8],ymm2[9],ymm13[9],ymm2[10],ymm13[10],ymm2[11],ymm13[11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm10, %xmm18 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm10 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[8],ymm5[8],ymm8[9],ymm5[9],ymm8[10],ymm5[10],ymm8[11],ymm5[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm13, %xmm17 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm15[0],ymm4[0],ymm15[1],ymm4[1],ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[8],ymm4[8],ymm15[9],ymm4[9],ymm15[10],ymm4[10],ymm15[11],ymm4[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm10, %xmm23 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm14, %ymm10 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[8],ymm6[8],ymm10[9],ymm6[9],ymm10[10],ymm6[10],ymm10[11],ymm6[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm14[4],ymm12[4],ymm14[5],ymm12[5],ymm14[6],ymm12[6],ymm14[7],ymm12[7],ymm14[12],ymm12[12],ymm14[13],ymm12[13],ymm14[14],ymm12[14],ymm14[15],ymm12[15] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [5,6,5,6,5,6,7,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm17, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [5,6,5,6,5,6,7,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm8[4],ymm5[4],ymm8[5],ymm5[5],ymm8[6],ymm5[6],ymm8[7],ymm5[7],ymm8[12],ymm5[12],ymm8[13],ymm5[13],ymm8[14],ymm5[14],ymm8[15],ymm5[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm18, %ymm12 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm8 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm21 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[8],ymm5[8],ymm8[9],ymm5[9],ymm8[10],ymm5[10],ymm8[11],ymm5[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm21 ; AVX512DQ-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm12, %zmm6, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm12, %zmm5, %zmm21 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [8,21,10,11,20,13,14,23] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm6 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm6, %zmm20, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [12,1,2,13,4,5,14,7] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm6, %ymm19, %ymm21 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm6 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm6 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm6[2,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm13, %ymm12 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm20, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [12,1,2,13,4,5,14,7] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm5, %ymm16, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm6 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm10[4],ymm8[4],ymm10[5],ymm8[5],ymm10[6],ymm8[6],ymm10[7],ymm8[7],ymm10[12],ymm8[12],ymm10[13],ymm8[13],ymm10[14],ymm8[14],ymm10[15],ymm8[15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[3,3,3,3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm8, %ymm18, %ymm8 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm27 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm8, %zmm1, %zmm27 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm20, %zmm29 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm16, %ymm27 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm8 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm17, %ymm9 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm10 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[8],ymm10[8],ymm0[9],ymm10[9],ymm0[10],ymm10[10],ymm0[11],ymm10[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm27 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm9, %zmm0, %zmm27 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm29 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm19, %ymm27 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm30 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm22 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm8 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,0,2,2,1,0,2,2] -; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm14, %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm9 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm14 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm7 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm5 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm2[0,0,2,1] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,2,2,1,0,2,2] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm9, %ymm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm25 ; AVX512DQ-FAST-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm7, %zmm9, %zmm25 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm9, %zmm8, %zmm25 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [16,9,10,17,12,13,18,15] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm6, %zmm16, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm22, %zmm26 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm5 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm14 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm11 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm1, %zmm11 {%k2} +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm6 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm8, %zmm11 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm13[4],ymm2[5],ymm13[5],ymm2[6],ymm13[6],ymm2[7],ymm13[7],ymm2[12],ymm13[12],ymm2[13],ymm13[13],ymm2[14],ymm13[14],ymm2[15],ymm13[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm17, %ymm5 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm13 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm4, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm22, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[12],ymm1[12],ymm6[13],ymm1[13],ymm6[14],ymm1[14],ymm6[15],ymm1[15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm15[4],ymm4[4],ymm15[5],ymm4[5],ymm15[6],ymm4[6],ymm15[7],ymm4[7],ymm15[12],ymm4[12],ymm15[13],ymm4[13],ymm15[14],ymm4[14],ymm15[15],ymm4[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm18, %ymm6 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm0 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm6, %zmm0, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm10 ; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm19, %ymm13 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm16, %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm0[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm0[2,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm5 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm1[4],ymm8[5],ymm1[5],ymm8[6],ymm1[6],ymm8[7],ymm1[7],ymm8[12],ymm1[12],ymm8[13],ymm1[13],ymm8[14],ymm1[14],ymm8[15],ymm1[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm17, %ymm6 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm7 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm15 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm15[0],ymm7[0],ymm15[1],ymm7[1],ymm15[2],ymm7[2],ymm15[3],ymm7[3],ymm15[8],ymm7[8],ymm15[9],ymm7[9],ymm15[10],ymm7[10],ymm15[11],ymm7[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm6, %zmm7, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm5 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm5, %zmm0, %zmm20 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm5, %ymm19, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm5[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm5[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm7 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm14 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm7[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm7, %ymm18, %ymm7 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm9 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm12 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[8],ymm9[8],ymm12[9],ymm9[9],ymm12[10],ymm9[10],ymm12[11],ymm9[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm7, %zmm9, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm7 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm20 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm7, %ymm16, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm7[2,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm14, %xmm13 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm12 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm13, %xmm10 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm12[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm10, %ymm12 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm7 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm17 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm7, %zmm12, %zmm17 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm16, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm15 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm12 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm13 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm14, %zmm16 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm12, %zmm15, %zmm16 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm12 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm13, %zmm22, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm13 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm12 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm10, %ymm12 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm8, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm10, %zmm10 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm8, %zmm12, %zmm10 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm12 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm1, %zmm10, %zmm16 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm1[1,1,1,1] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm1[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm1[1,1,1,1] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm1[1,1,1,1] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm21[0,1,2,3],zmm23[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm21 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm23, %zmm1, %zmm21 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm27[0,1,2,3],zmm29[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm23 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm27, %zmm1, %zmm23 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm4[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm31, %zmm24 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm13, %zmm1, %zmm24 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm0[0,1,2,3],zmm20[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm22, %zmm19 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm13, %zmm1, %zmm19 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm15 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,8,3,4,9,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm13, %ymm25 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm13, %ymm11 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm30[0],zero,xmm30[1],zero,xmm30[2],zero,xmm30[3],zero -; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm13, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm15 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm2, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm15, %zmm2 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm3, %zmm2, %zmm22 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[2],mem[2],ymm10[3],mem[3],ymm10[8],mem[8],ymm10[9],mem[9],ymm10[10],mem[10],ymm10[11],mem[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,1,1] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm14 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[2],mem[2],ymm14[3],mem[3],ymm14[8],mem[8],ymm14[9],mem[9],ymm14[10],mem[10],ymm14[11],mem[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,2,2,3] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm21[0,1,2,3],zmm24[0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm24 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm14, %zmm21, %zmm24 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm27[0,1,2,3],zmm29[0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm14, %zmm21, %zmm27 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm17[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm31, %zmm29 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm21, %zmm29 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm20[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm19, %zmm30 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm21, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm17, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm17, %ymm11 ; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero -; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm13, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> -; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm14, %zmm9, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm8, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm7, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [0,9,2,3,8,5,6,11] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm12, %ymm20, %ymm13 -; AVX512DQ-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm12, %zmm7, %zmm15 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm25[0,1,2,3],zmm26[0,1,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm26 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm4, %ymm20, %ymm9 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm17, %ymm16 +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm17, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm20 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm6, %zmm3, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm19 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm10, %zmm8, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm8 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm15, %zmm13, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [8,9,20,11,12,21,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm18, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,9,2,3,8,5,6,11] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm15, %ymm20 +; AVX512DQ-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm10 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm25[0,1,2,3],zmm26[0,1,2,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm14, %xmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm15, %ymm19 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm14 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm7 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm3 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm27 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm31 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm12, %zmm25, %zmm27 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm11[0,1,2,3],zmm28[0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm4, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm31 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm11[0,1,2,3],zmm28[0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm14, %zmm5, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm14 ; AVX512DQ-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm7, %zmm12 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm18, %zmm17 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,0,2,1,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm11 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm11 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm11 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm11, %ymm20, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm11, %ymm15, %ymm8 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm11 -; AVX512DQ-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm3, %zmm1, %zmm7 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm3, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm20, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm3 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm3[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm3[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm6[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm4[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm4, %zmm0, %zmm18 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm4, %ymm26 +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm12 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm6, %ymm15, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm6 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm14, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm11, %xmm11 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm15 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm11[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm22, %zmm25, %zmm5 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm17[0,1,2,3],zmm18[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm26, %zmm18 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm17, %zmm25, %zmm18 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm16[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm10, %zmm25, %zmm0 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm15[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm29, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm30, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm7[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm28, %zmm25, %zmm5 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm16[0,1,2,3],zmm23[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm26, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm16, %zmm25, %zmm6 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm22[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm12, %zmm9 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm9 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm20[0,1,2,3],zmm21[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm19[0,1,2,3],zmm10[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm8[0,1,2,3],zmm17[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm18[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm12, %zmm2 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 320(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 704(%rax) -; AVX512DQ-FAST-NEXT: addq $920, %rsp # imm = 0x398 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 704(%rax) +; AVX512DQ-FAST-NEXT: addq $888, %rsp # imm = 0x378 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride6_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm16 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm24 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm25, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm12, %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm6, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm17, %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm14, %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm15 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm13 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm26, %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm11, %zmm20 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm5, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm18, %zmm22 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm8, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm21, %zmm23 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm18, %zmm16 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm22, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm25, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm25, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm21, %zmm23 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm10, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] +; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm17, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm26, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm26, %zmm25 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm2, %zmm6 -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm2, %zmm17 -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm2, %zmm18 -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm2, %zmm22 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm24, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm25 -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm1, %zmm12 -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm1, %zmm14 -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm1, %zmm21 -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm1, %zmm24 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm1 +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm18 +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm17 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm26, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm24, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm16 +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm11 +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm21 +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm24 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm1 ; AVX512BW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm12 ; AVX512BW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm9 -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31> -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u> -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31> -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm8, %zmm11 -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31> -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u> -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31> -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm25 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm14 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm18 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm9 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm7, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm12, %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm2, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm7, %zmm18 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm10, %zmm22 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31> +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u> +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31> +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm6 +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31> +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 +; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u> +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31> +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm5 +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm8 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm10 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm13 +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm7, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm3, %zmm6 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm12, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm16 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm7, %zmm5 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm3, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm15, %zmm17 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm1 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index 982c07752b6b5..fe4c2504228a2 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -172,77 +172,77 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm6 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,0,1] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm6, %xmm8 ; SSE-NEXT: pandn %xmm7, %xmm6 ; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm2[0] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm3[0] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm2[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm7, %xmm12 +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: por %xmm12, %xmm7 +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 ; SSE-NEXT: psrld $16, %xmm10 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,0,65535] ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm9, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm9, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[3,1,2,3] +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm8[0,1,2,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0] ; SSE-NEXT: pandn %xmm9, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 ; SSE-NEXT: psrlq $48, %xmm4 ; SSE-NEXT: por %xmm8, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,1] -; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: andps %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: andps %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: andnps %xmm3, %xmm2 -; SSE-NEXT: orps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm2, (%rax) -; SSE-NEXT: movq %xmm5, 48(%rax) +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: andnps %xmm2, %xmm3 +; SSE-NEXT: orps %xmm0, %xmm3 +; SSE-NEXT: movaps %xmm3, (%rax) +; SSE-NEXT: movq %xmm7, 48(%rax) ; SSE-NEXT: movdqa %xmm6, 32(%rax) -; SSE-NEXT: movdqa %xmm7, 16(%rax) +; SSE-NEXT: movdqa %xmm5, 16(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf4: @@ -583,53 +583,53 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i16_stride7_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa (%rsi), %xmm8 ; SSE-NEXT: movdqa (%rdx), %xmm5 ; SSE-NEXT: movdqa (%rcx), %xmm11 ; SSE-NEXT: movdqa (%r8), %xmm4 -; SSE-NEXT: movdqa (%r9), %xmm9 -; SSE-NEXT: movdqa (%rax), %xmm3 +; SSE-NEXT: movdqa (%r9), %xmm10 +; SSE-NEXT: movdqa (%rax), %xmm2 ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,2],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSE-NEXT: movdqa %xmm10, %xmm13 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm12, %xmm13 -; SSE-NEXT: pandn %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm9, %xmm12 ; SSE-NEXT: por %xmm13, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm12[0,3] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm12[0,3] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: movdqa %xmm7, %xmm13 ; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm10, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[2,2,2,2] +; SSE-NEXT: por %xmm9, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,65535,0,65535] ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm11[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm10, %xmm14 -; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm9, %xmm14 +; SSE-NEXT: movaps {{.*#+}} xmm9 = [65535,0,0,0,65535,65535,65535,65535] ; SSE-NEXT: por %xmm15, %xmm14 ; SSE-NEXT: movdqa %xmm6, %xmm15 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm14[3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm15[0,2] -; SSE-NEXT: andps %xmm10, %xmm14 -; SSE-NEXT: andnps %xmm13, %xmm10 -; SSE-NEXT: orps %xmm14, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[0,1,0,1] +; SSE-NEXT: andps %xmm9, %xmm14 +; SSE-NEXT: andnps %xmm13, %xmm9 +; SSE-NEXT: orps %xmm14, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,1,0,1] ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm14, %xmm12 @@ -641,7 +641,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm12, %xmm15 @@ -653,52 +653,51 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm15, %xmm12 ; SSE-NEXT: pandn %xmm14, %xmm15 ; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] ; SSE-NEXT: por %xmm12, %xmm15 ; SSE-NEXT: psrlq $48, %xmm11 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm11[1] ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: pandn %xmm13, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm13 +; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm11, %xmm13 +; SSE-NEXT: psrld $16, %xmm10 ; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm13 -; SSE-NEXT: por %xmm12, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm9, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[3,3,3,3] -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: por %xmm13, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm2[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: movdqa %xmm8, %xmm13 -; SSE-NEXT: psrld $16, %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm13 -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm5[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: por %xmm13, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: pand %xmm13, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,2],xmm3[1,1] -; SSE-NEXT: pandn %xmm8, %xmm13 -; SSE-NEXT: por %xmm11, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: por %xmm13, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm10, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[3,3,3,3] +; SSE-NEXT: pandn %xmm11, %xmm10 +; SSE-NEXT: por %xmm12, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE-NEXT: psrld $16, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm12, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: pand %xmm8, %xmm12 +; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,2],xmm2[1,1] +; SSE-NEXT: pandn %xmm13, %xmm8 +; SSE-NEXT: por %xmm12, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: movaps %xmm2, %xmm6 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm14[0,1] ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,1] @@ -706,24 +705,24 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: andps %xmm4, %xmm6 ; SSE-NEXT: andnps %xmm0, %xmm4 ; SSE-NEXT: orps %xmm6, %xmm4 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm12[2,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: andps %xmm1, %xmm5 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm5, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm11[2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: andps %xmm2, %xmm5 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: orps %xmm5, %xmm2 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm1, (%rax) +; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: movaps %xmm4, 64(%rax) ; SSE-NEXT: movdqa %xmm15, 16(%rax) -; SSE-NEXT: movdqa %xmm13, 32(%rax) -; SSE-NEXT: movaps %xmm10, 48(%rax) -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] -; SSE-NEXT: movaps %xmm7, 80(%rax) -; SSE-NEXT: movdqa %xmm9, 96(%rax) +; SSE-NEXT: movdqa %xmm8, 32(%rax) +; SSE-NEXT: movaps %xmm9, 48(%rax) +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] +; SSE-NEXT: movaps %xmm1, 80(%rax) +; SSE-NEXT: movdqa %xmm10, 96(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf8: @@ -1193,336 +1192,338 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $232, %rsp +; SSE-NEXT: subq $216, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa 16(%rsi), %xmm2 -; SSE-NEXT: movdqa 16(%rdx), %xmm7 -; SSE-NEXT: movdqa 16(%rcx), %xmm12 -; SSE-NEXT: movdqa 16(%r8), %xmm10 -; SSE-NEXT: movdqa 16(%r9), %xmm5 -; SSE-NEXT: movdqa 16(%rax), %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rdx), %xmm15 +; SSE-NEXT: movdqa 16(%rcx), %xmm1 +; SSE-NEXT: movdqa 16(%r8), %xmm8 +; SSE-NEXT: movdqa 16(%r9), %xmm7 +; SSE-NEXT: movdqa 16(%rax), %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps {{.*#+}} xmm15 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm15, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm9 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] -; SSE-NEXT: movaps %xmm15, %xmm5 -; SSE-NEXT: andnps %xmm3, %xmm5 -; SSE-NEXT: orps %xmm2, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: movdqa (%r8), %xmm6 -; SSE-NEXT: movdqa (%r9), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSE-NEXT: movaps {{.*#+}} xmm6 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] +; SSE-NEXT: andnps %xmm1, %xmm6 +; SSE-NEXT: orps %xmm0, %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa (%rdx), %xmm14 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa (%r8), %xmm2 +; SSE-NEXT: movdqa (%r9), %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa (%rdx), %xmm13 ; SSE-NEXT: movdqa (%rcx), %xmm5 -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa (%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm12 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm13 -; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm11[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[0,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm14[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm6[0,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,1,0,1] +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: psrld $16, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm10, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm9, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[3,3] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm9[0,2] -; SSE-NEXT: andps %xmm7, %xmm12 -; SSE-NEXT: orps %xmm8, %xmm12 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[2,2,2,2] -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0,2] +; SSE-NEXT: andps %xmm8, %xmm1 +; SSE-NEXT: orps %xmm6, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: pandn %xmm14, %xmm7 +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[2,2,2,2] +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm10[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] -; SSE-NEXT: andps %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] -; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, %xmm9 -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm11, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,5,4] -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: por %xmm5, %xmm11 -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm10[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,2],xmm5[2,0] -; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,5,6,6,7] -; SSE-NEXT: andps %xmm15, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] -; SSE-NEXT: andnps %xmm9, %xmm15 -; SSE-NEXT: orps %xmm4, %xmm15 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[2,3,2,3] -; SSE-NEXT: andnps %xmm8, %xmm0 -; SSE-NEXT: orps %xmm15, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,6,7] +; SSE-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: orps %xmm1, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: orps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,2,3] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm6[1,1] -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,2],xmm7[1,1] +; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: andnps %xmm5, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: orps %xmm1, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE-NEXT: psrld $16, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,2],xmm13[1,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: andnps %xmm15, %xmm2 -; SSE-NEXT: orps %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm6, %xmm15 -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[1],mem[0] -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm7[2,1] -; SSE-NEXT: andps %xmm0, %xmm13 -; SSE-NEXT: orps %xmm15, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: movaps %xmm10, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[0,1] -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,1] -; SSE-NEXT: andps %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm9[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2,0],mem[2,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: andps %xmm5, %xmm14 -; SSE-NEXT: por %xmm14, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: andps %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm15[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: movaps (%rsp), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,2],xmm11[1,1] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: andnps %xmm9, %xmm6 +; SSE-NEXT: orps %xmm1, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,0,65535] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: pandn %xmm12, %xmm9 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm12[2,1] +; SSE-NEXT: andps %xmm1, %xmm11 +; SSE-NEXT: orps %xmm9, %xmm11 +; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm9 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: por %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[0,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[2,1] +; SSE-NEXT: andps %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm13 = xmm13[0],xmm15[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2,0],mem[2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: andps %xmm4, %xmm13 +; SSE-NEXT: por %xmm13, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: andps %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm5, 112(%rax) -; SSE-NEXT: movdqa %xmm7, (%rax) -; SSE-NEXT: movdqa %xmm0, 176(%rax) -; SSE-NEXT: movaps %xmm13, 64(%rax) -; SSE-NEXT: movaps %xmm2, 32(%rax) +; SSE-NEXT: movdqa %xmm4, 112(%rax) +; SSE-NEXT: movdqa %xmm5, (%rax) +; SSE-NEXT: movdqa %xmm1, 176(%rax) +; SSE-NEXT: movaps %xmm11, 64(%rax) +; SSE-NEXT: movaps %xmm6, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm12, 160(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1533,10 +1534,10 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm3, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rax) -; SSE-NEXT: movdqa %xmm11, 96(%rax) +; SSE-NEXT: movdqa %xmm8, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) -; SSE-NEXT: addq $232, %rsp +; SSE-NEXT: addq $216, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf16: @@ -1545,70 +1546,70 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm15 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm14[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm14[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm8 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm9 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm9 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm13 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm9[1,2,3,4,5,6],xmm11[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0],xmm10[1,2,3,4,5,6],xmm12[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[2,2,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm2[6],xmm10[7] ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm7, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm13[3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm0 @@ -1616,31 +1617,31 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5],xmm7[6],xmm3[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,5],xmm10[6],xmm3[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm3[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm11 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm12 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm4[0,2],xmm12[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[0,2],xmm12[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm1 @@ -1649,30 +1650,30 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm11, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm1 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm8[0,2],xmm13[1,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm11[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm8[0,2],xmm13[1,3] ; AVX1-ONLY-NEXT: vmovaps %xmm8, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] @@ -1699,7 +1700,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] @@ -1709,12 +1710,12 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] @@ -1724,7 +1725,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm2[1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 @@ -1743,7 +1744,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[0,0,0,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm10[2],xmm4[2],xmm10[3],xmm4[3] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,u,u,u,u,u,u,u,u,6,7,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 @@ -1770,26 +1771,28 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-SLOW-LABEL: store_i16_stride7_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: pushq %rax +; AVX2-SLOW-NEXT: subq $40, %rsp ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm13 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> ; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm7 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm13[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 @@ -1797,6 +1800,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = @@ -1807,23 +1811,23 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm11 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm12 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm14 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] ; AVX2-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm0 @@ -1835,51 +1839,54 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[1,1,2,3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,5,7,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm12 +; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm14 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm9, %ymm12, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm8 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7,8,9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6,7,8],ymm12[9],ymm15[10,11],ymm12[12],ymm15[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7,8,9,10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,3,6,6,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,3,6,6,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,3,3,3,6,7,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[2,3,3,3,6,7,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, %ymm12 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] @@ -1896,63 +1903,68 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3],ymm11[4],ymm1[5,6,7,8],ymm11[9],ymm1[10,11],ymm11[12],ymm1[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm12, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm11[2],ymm1[3,4],ymm11[5],ymm1[6,7,8,9],ymm11[10],ymm1[11,12],ymm11[13],ymm1[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm11, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 160(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm12, 192(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm14, 192(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-SLOW-NEXT: popq %rax +; AVX2-SLOW-NEXT: addq $40, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -2035,8 +2047,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm12 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm13 @@ -2044,8 +2056,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm8, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm8, %ymm11 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm8 ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] @@ -2055,11 +2067,11 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,2,3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1],xmm1[2],xmm9[3,4],xmm1[5],xmm9[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] @@ -2094,7 +2106,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] @@ -2119,7 +2131,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm8, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm12, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 64(%rax) ; AVX2-FAST-NEXT: popq %rax ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -2133,18 +2145,20 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm4, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = @@ -2152,7 +2166,6 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <3,u,u,3,u,u,u,4> @@ -2167,9 +2180,9 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = @@ -2185,8 +2198,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,3] @@ -2205,21 +2218,22 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7,8,9,10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm5[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u> @@ -2232,7 +2246,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> @@ -2244,20 +2258,20 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm9[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6,7,8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1],ymm1[2],ymm11[3,4],ymm1[5],ymm11[6,7,8,9],ymm1[10],ymm11[11,12],ymm1[13],ymm11[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[0,1,2,2,4,5,6,6] @@ -2265,19 +2279,19 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm11, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,5,7] @@ -2290,7 +2304,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2649,7 +2663,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $696, %rsp # imm = 0x2B8 +; SSE-NEXT: subq $680, %rsp # imm = 0x2A8 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2657,30 +2671,31 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdx), %xmm1 ; SSE-NEXT: movdqa 48(%rcx), %xmm5 -; SSE-NEXT: movdqa 48(%r8), %xmm6 +; SSE-NEXT: movdqa 48(%r8), %xmm9 ; SSE-NEXT: movdqa 48(%r9), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rax), %xmm9 +; SSE-NEXT: movaps 48(%rax), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm11 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm0 @@ -2688,473 +2703,469 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm14 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm14, %xmm2 -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: andnps %xmm9, %xmm1 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm1, %xmm2 +; SSE-NEXT: andnps %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: andps %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: andps %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa (%r8), %xmm0 +; SSE-NEXT: movdqa (%r9), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm13 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%r8), %xmm11 -; SSE-NEXT: movdqa (%r9), %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rcx), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa (%rcx), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa (%rdx), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm14 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm15, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa (%rsi), %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: andnps %xmm12, %xmm1 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm6, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] -; SSE-NEXT: andps %xmm2, %xmm3 -; SSE-NEXT: orps %xmm1, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm0, %xmm3 +; SSE-NEXT: orps %xmm2, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,2],xmm6[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 16(%r8), %xmm11 -; SSE-NEXT: movdqa 16(%r9), %xmm10 -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm2[0,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rax), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 16(%rcx), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa 16(%r8), %xmm10 +; SSE-NEXT: movdqa 16(%r9), %xmm8 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa 16(%rcx), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 16(%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm6 -; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: andnps %xmm12, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rsi), %xmm13 ; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,2,2,2] +; SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm6, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm8, %xmm0 -; SSE-NEXT: orps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm0, %xmm3 +; SSE-NEXT: orps %xmm2, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm2[0,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rax), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rax), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 32(%r8), %xmm1 -; SSE-NEXT: movdqa 32(%r9), %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa 32(%r8), %xmm10 +; SSE-NEXT: movdqa 32(%r9), %xmm9 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm14 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 32(%rcx), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 32(%rdx), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: movdqa 32(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa 32(%rcx), %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa 32(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 32(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm5, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: andnps %xmm0, %xmm14 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm14, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm4, %xmm13 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm13[3,3] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,2,2] +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm6[3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm10, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm10 -; SSE-NEXT: orps %xmm4, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm5, %xmm4 +; SSE-NEXT: andnps %xmm2, %xmm5 +; SSE-NEXT: orps %xmm4, %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm2[0,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,0,1] -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,0,1] +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: por %xmm3, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm2, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm2 +; SSE-NEXT: por %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm6[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm8, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] +; SSE-NEXT: psrlq $48, %xmm15 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [0,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,5,4] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm14 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,5,4] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; SSE-NEXT: movdqa %xmm1, %xmm14 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: psrlq $48, %xmm12 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,5,4] +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,6,6] +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm2[1,1] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm5[1,1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps {{.*#+}} xmm12 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: andnps %xmm2, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] @@ -3162,22 +3173,21 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,6,6] +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm11[1,1] -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: movaps %xmm6, %xmm8 -; SSE-NEXT: andnps %xmm1, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm8 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm10[1,1] +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: andnps %xmm2, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] @@ -3185,21 +3195,22 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,5,6,6] +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[1,1] -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: andnps %xmm1, %xmm5 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm5 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm7[1,1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm12, %xmm10 +; SSE-NEXT: andnps %xmm1, %xmm10 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] @@ -3217,13 +3228,12 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm7[1,1] -; SSE-NEXT: andnps %xmm1, %xmm6 -; SSE-NEXT: orps %xmm0, %xmm6 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[1,1] +; SSE-NEXT: andnps %xmm1, %xmm12 +; SSE-NEXT: orps %xmm0, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps $42, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2,2],mem[2,0] @@ -3236,7 +3246,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: orps %xmm1, %xmm2 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: andps %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] ; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3244,117 +3254,114 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,0,65535] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm11 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm11 +; SSE-NEXT: orps %xmm1, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,xmm13[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm13 +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movapd %xmm7, %xmm13 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm13 +; SSE-NEXT: orps %xmm1, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm3, %xmm12 -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[1],mem[0] -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[2,1] -; SSE-NEXT: andps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,xmm15[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,xmm15[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: movaps %xmm14, %xmm15 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm7[2,1] -; SSE-NEXT: andps %xmm0, %xmm15 -; SSE-NEXT: orps %xmm2, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm7, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[2,1] +; SSE-NEXT: andps %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: andps %xmm6, %xmm8 +; SSE-NEXT: por %xmm8, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm9[0,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[2,1] -; SSE-NEXT: andps %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: andps %xmm4, %xmm10 -; SSE-NEXT: por %xmm10, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: andps %xmm4, %xmm7 -; SSE-NEXT: por %xmm7, %xmm14 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pandn %xmm2, %xmm12 -; SSE-NEXT: andps %xmm4, %xmm7 -; SSE-NEXT: por %xmm7, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2,0],mem[2,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,0,1,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] -; SSE-NEXT: andps %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: andps %xmm6, %xmm3 +; SSE-NEXT: por %xmm3, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: andps %xmm6, %xmm0 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; SSE-NEXT: andps %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm4, 336(%rax) -; SSE-NEXT: movdqa %xmm12, 224(%rax) -; SSE-NEXT: movdqa %xmm14, 112(%rax) -; SSE-NEXT: movdqa %xmm9, (%rax) -; SSE-NEXT: movdqa %xmm0, 288(%rax) -; SSE-NEXT: movaps %xmm15, 176(%rax) -; SSE-NEXT: movaps %xmm1, 64(%rax) +; SSE-NEXT: movdqa %xmm6, 336(%rax) +; SSE-NEXT: movdqa %xmm9, 224(%rax) +; SSE-NEXT: movdqa %xmm8, 112(%rax) +; SSE-NEXT: movdqa %xmm1, (%rax) +; SSE-NEXT: movdqa %xmm2, 288(%rax) +; SSE-NEXT: movaps %xmm13, 176(%rax) +; SSE-NEXT: movaps %xmm11, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rax) -; SSE-NEXT: movaps %xmm6, 368(%rax) +; SSE-NEXT: movaps %xmm12, 368(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3362,7 +3369,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 304(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) -; SSE-NEXT: movaps %xmm5, 256(%rax) +; SSE-NEXT: movaps %xmm10, 256(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3370,7 +3377,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) -; SSE-NEXT: movaps %xmm8, 144(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3382,7 +3390,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm13, 320(%rax) +; SSE-NEXT: movdqa %xmm15, 320(%rax) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3393,77 +3401,77 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 400(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%rax) -; SSE-NEXT: addq $696, %rsp # imm = 0x2B8 +; SSE-NEXT: addq $680, %rsp # imm = 0x2A8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $600, %rsp # imm = 0x258 +; AVX1-ONLY-NEXT: subq $584, %rsp # imm = 0x248 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm1[6],xmm5[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm6[1,2,3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm9 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm5[1,2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm13 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm5 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm3[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm4[1],xmm6[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3,4,5,6],xmm6[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,5,6,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,5,6,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] @@ -3474,34 +3482,34 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm6[6],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm11, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm11 @@ -3513,72 +3521,71 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,5],xmm5[6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm7[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5],xmm6[6],xmm1[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm14 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm1[3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm5[0,2],xmm1[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm14 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm13 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm3[0],xmm13[0],xmm3[1],xmm13[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm15 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm6[3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[0,2],xmm6[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm15, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm14, %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[0,2],xmm6[1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm7[0,2],xmm5[1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] @@ -3587,103 +3594,101 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm7 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm10 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm15[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm12[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm11[6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm11 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm11[6],xmm1[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm10[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm15[0,2],xmm10[1,3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm2[1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm2 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 @@ -3691,12 +3696,12 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[0,2],xmm13[1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm13[1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 @@ -3722,213 +3727,213 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm4[1],xmm0[1] ; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2],xmm1[3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm2[3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vpermilps $80, (%rsp), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm13[0,0,0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 432(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm14, 384(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rax) -; AVX1-ONLY-NEXT: addq $600, %rsp # imm = 0x258 +; AVX1-ONLY-NEXT: addq $584, %rsp # imm = 0x248 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride7_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $600, %rsp # imm = 0x258 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm13 +; AVX2-SLOW-NEXT: subq $616, %rsp # imm = 0x268 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm13 ; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm12 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm15 +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm14 ; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm7 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm8, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,3,2,3,4,7,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpermd %ymm9, %ymm2, %ymm4 -; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,3,2,3,4,7,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermd %ymm12, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm5, %ymm2, %ymm2 @@ -3956,27 +3961,27 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> @@ -3987,13 +3992,13 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -4002,7 +4007,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpbroadcastd 60(%r8), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -4013,115 +4018,113 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX2-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm7, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm13 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm15, %ymm14, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm15, %ymm14, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm12, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2],xmm14[3,4],xmm6[5],xmm14[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3],xmm0[4],xmm6[5,6],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1],xmm6[2],xmm15[3,4],xmm6[5],xmm15[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm3[2],xmm12[3,4],xmm3[5],xmm12[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm9 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm3[2],xmm14[3,4],xmm3[5],xmm14[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,7,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm2, %ymm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm15 -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm14 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] @@ -4131,7 +4134,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm5 @@ -4251,48 +4256,53 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 192(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm15, 224(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 288(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm14, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-SLOW-NEXT: addq $600, %rsp # imm = 0x258 +; AVX2-SLOW-NEXT: addq $616, %rsp # imm = 0x268 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride7_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $328, %rsp # imm = 0x148 +; AVX2-FAST-NEXT: subq $312, %rsp # imm = 0x138 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm14 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm6 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm13 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm10 +; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm3 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm9 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm7 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> @@ -4301,21 +4311,26 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[1,2,2,3,5,6,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] @@ -4324,132 +4339,129 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm12 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0,1],ymm2[2],ymm15[3,4],ymm2[5],ymm15[6,7,8,9],ymm2[10],ymm15[11,12],ymm2[13],ymm15[14,15] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7,8,9],ymm2[10],ymm7[11,12],ymm2[13],ymm7[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3],ymm3[4,5],ymm10[6],ymm3[7,8,9,10],ymm10[11],ymm3[12,13],ymm10[14],ymm3[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm3[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm2[2],ymm8[3,4],ymm2[5],ymm8[6,7,8,9],ymm2[10],ymm8[11,12],ymm2[13],ymm8[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm6 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3],ymm1[4,5],ymm13[6],ymm1[7,8,9,10],ymm13[11],ymm1[12,13],ymm13[14],ymm1[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,2,3,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[0,0,2,1,4,4,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7,8,9,10],ymm10[11],ymm4[12,13],ymm10[14],ymm4[15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7,8,9,10],ymm13[11],ymm3[12,13],ymm13[14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm15, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm8, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm7 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <3,u,u,3,u,u,u,4> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm7, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = @@ -4457,18 +4469,18 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastd 60(%r8), %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 @@ -4476,192 +4488,195 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm10 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm9 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm8 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm10 ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd 32(%rax), %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm8, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1],xmm13[2,3],xmm8[4],xmm13[5,6],xmm8[7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm14 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm14 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm4, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm10, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm14[2],xmm4[3,4],xmm14[5],xmm4[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm11 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm3, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3],xmm0[4],xmm11[5,6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm4 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0,1],xmm4[2],xmm11[3,4],xmm4[5],xmm11[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm8, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3],xmm1[4],xmm8[5,6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm8 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1],xmm8[2],xmm13[3,4],xmm8[5],xmm13[6,7] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm5, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX2-FAST-NEXT: vpbroadcastd 40(%rax), %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm6, %ymm13, %ymm6 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 40(%rax), %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 320(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 128(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 352(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 160(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 288(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 256(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 320(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 128(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 352(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 160(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 192(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 288(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 256(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FAST-NEXT: addq $328, %rsp # imm = 0x148 +; AVX2-FAST-NEXT: addq $312, %rsp # imm = 0x138 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride7_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $616, %rsp # imm = 0x268 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm8, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm12, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm11, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm10, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm5, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm15, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm12, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm11, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm5 @@ -4687,13 +4702,13 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] @@ -4705,8 +4720,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -4716,11 +4731,11 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -4729,7 +4744,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 60(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -4743,107 +4758,106 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rax), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rax), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm15, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm7 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1],xmm9[2],xmm15[3,4],xmm9[5],xmm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm9, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3],xmm1[4],xmm9[5,6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2],xmm9[3,4],xmm7[5],xmm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm3, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] @@ -4853,7 +4867,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 40(%rax), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rax), %ymm5 @@ -4908,31 +4923,31 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm10, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7,8,9],ymm6[10],ymm8[11,12],ymm6[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm7, %ymm7 @@ -4953,23 +4968,23 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 288(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 256(%rax) @@ -4983,997 +4998,1008 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-SLOW-LABEL: store_i16_stride7_vf32: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $792, %rsp # imm = 0x318 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm26 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: subq $824, %rsp # imm = 0x338 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm14, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm15, %ymm2 -; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm1 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm4 -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm20 +; AVX512F-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm8, %ymm7 +; AVX512F-SLOW-NEXT: vpor %ymm3, %ymm7, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm2 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm13 -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm13, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm2 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm5, %ymm4 +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm4, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm6 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,7,6] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> -; AVX512F-SLOW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm17 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> -; AVX512F-SLOW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vprold $16, %ymm14, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> +; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm0, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermi2d %zmm11, %zmm10, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm0, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vprold $16, %ymm6, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15] ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8,9,10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7,8,9,10],ymm0[11],ymm6[12,13],ymm0[14],ymm6[15] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,3,3,10,9,11,10] -; AVX512F-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vpermi2q %zmm4, %zmm6, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm15 -; AVX512F-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpandn %ymm2, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm15, %ymm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7,8,9],ymm11[10],ymm9[11,12],ymm11[13],ymm9[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm31 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7,8,9],ymm9[10],ymm11[11,12],ymm9[13],ymm11[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm29 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm11 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpandn %ymm9, %ymm14, %ymm9 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm11, %ymm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm7[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm9[3],ymm3[4,5],ymm9[6],ymm3[7,8,9,10],ymm9[11],ymm3[12,13],ymm9[14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm30 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7,8,9],ymm14[10],ymm3[11,12],ymm14[13],ymm3[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7,8,9,10],ymm10[11],ymm4[12,13],ymm10[14],ymm4[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512F-SLOW-NEXT: vprold $16, %ymm13, %ymm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm4[2],ymm12[3,4],ymm4[5],ymm12[6,7,8,9],ymm4[10],ymm12[11,12],ymm4[13],ymm12[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7,8,9],ymm4[10],ymm7[11,12],ymm4[13],ymm7[14,15] -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; AVX512F-SLOW-NEXT: vprold $16, %xmm7, %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm11 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,0,2,1] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm8 +; AVX512F-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpandn %ymm4, %ymm6, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512F-SLOW-NEXT: vmovdqa %ymm13, %ymm0 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm13[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm17[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm9, %ymm29 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm30 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm14 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpandn %ymm10, %ymm12, %ymm10 +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm14, %ymm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0,1,2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7,8,9,10],ymm10[11],ymm6[12,13],ymm10[14],ymm6[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm31 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7,8,9],ymm10[10],ymm6[11,12],ymm10[13],ymm6[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6,7,8],ymm6[9],ymm1[10,11],ymm6[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm27 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6,7,8],ymm1[9],ymm6[10,11],ymm1[12],ymm6[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm19 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm18 +; AVX512F-SLOW-NEXT: vprold $16, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-SLOW-NEXT: vprold $16, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm9 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm11 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm26[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6,7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6,7,8],ymm0[9],ymm5[10,11],ymm0[12],ymm5[13,14,15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm10 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm22[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3],ymm12[4,5],ymm0[6],ymm12[7,8,9,10],ymm0[11],ymm12[12,13],ymm0[14],ymm12[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5],ymm0[6],ymm5[7,8,9,10],ymm0[11],ymm5[12,13],ymm0[14],ymm5[15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512F-SLOW-NEXT: vprold $16, %xmm0, %xmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm2[2],xmm14[3,4],xmm2[5],xmm14[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm4[1],xmm13[2,3],xmm4[4],xmm13[5,6],xmm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm26[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm26[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm7[2],ymm14[3,4],ymm7[5],ymm14[6,7,8,9],ymm7[10],ymm14[11,12],ymm7[13],ymm14[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm10, %ymm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm22[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7,8,9],ymm10[10],ymm14[11,12],ymm10[13],ymm14[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm31[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm18[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512F-SLOW-NEXT: vprold $16, %xmm5, %xmm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm2[1],xmm6[2,3],xmm2[4],xmm6[5,6],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm9, %ymm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm20[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7,8,9],ymm15[10],ymm4[11,12],ymm15[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,0,1,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,1,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm6 = mem[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm27[0,2,2,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm19[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm16[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6,7,8],ymm9[9],ymm0[10,11],ymm9[12],ymm0[13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[0,1,2,2,4,5,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,3,3,6,7,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm16 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm18[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm16[2,1,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm13[2,2,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm7[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3],ymm13[4],ymm1[5,6,7,8],ymm13[9],ymm1[10,11],ymm13[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm22, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm16[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm16 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm20, %zmm16 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm7, %zmm14 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm7[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm3 -; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm14 = mem[0,0,2,1] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm16 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm18 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm19 = mem[2,1,3,3] +; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm11, %zmm16 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm16 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm11[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3 +; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm16 = mem[0,0,2,1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm18 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm19 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm21 = mem[2,1,3,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,1,1] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[0,0,2,1] -; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm10 = mem[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm25[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm7 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm20[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm17[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm12 = mem[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm11 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm26[2,1,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm22[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm17[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm4, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,3,3,3,6,7,7,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm31 = ymm15[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,1,2,2,4,5,6,6] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm27, %zmm5 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm11[2,1,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm13[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm18, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm19, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm9 -; AVX512F-SLOW-NEXT: vpbroadcastd (%rax), %ymm4 -; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm4 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX512F-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm4, %zmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm27, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm9 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm14[2,1,3,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm2, %zmm2 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm24, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm26, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm22, %zmm7 -; AVX512F-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm5 -; AVX512F-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm15[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm9, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm10[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm19, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm21, %zmm10 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm4, %zmm10 +; AVX512F-SLOW-NEXT: vpbroadcastd (%rax), %ymm4 +; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm4 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] +; AVX512F-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm30, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm31[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm15[2,1,3,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm24, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm25, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm20, %zmm9 +; AVX512F-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm7 +; AVX512F-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm29, %zmm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm31, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm6 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 384(%rax) -; AVX512F-SLOW-NEXT: addq $792, %rsp # imm = 0x318 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512F-SLOW-NEXT: addq $824, %rsp # imm = 0x338 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512F-ONLY-FAST-NEXT: subq $744, %rsp # imm = 0x2E8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm3, %ymm4, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm1, %ymm2, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,1,8,9,9,11] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,0,1,1,12,13,u,15> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm6, %ymm7, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm1, %ymm3, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,1,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,0,0,1,8,9,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,0,1,1,12,13,u,15> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm9, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm4, %ymm7, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,u,3,10,10,11,11> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm17, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,2,3,8,8,8,9] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm14, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,2,3,3,10,9,11,10] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm5, %ymm9, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm12[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = <0,1,u,3,10,10,11,11> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm19, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm5, %ymm7, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8,9,10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm15, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm27, %zmm9, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm10, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm2[2],xmm8[3,4],xmm2[5],xmm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm29, %zmm9, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm1[2],xmm6[3,4],xmm1[5],xmm6[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm6, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm6[2],xmm1[3,4],xmm6[5],xmm1[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm9[1],xmm12[2,3],xmm9[4],xmm12[5,6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm1[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7,8],ymm5[9],ymm9[10,11],ymm5[12],ymm9[13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5,6,7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7,8,9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7,8,9],ymm7[10],ymm15[11,12],ymm7[13],ymm15[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <6,u,u,u,7,u,u,7> -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $246, (%rsp), %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[2,1,3,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm24[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm14[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7,8,9],ymm15[10],ymm2[11,12],ymm15[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7,8],ymm15[9],ymm14[10,11],ymm15[12],ymm14[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <6,u,u,u,7,u,u,7> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm27, %ymm14, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm16[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm29 = mem[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, (%rsp), %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,2,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm22[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm6, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm18[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm26[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm20[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm16[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm24, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm26[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm17[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm21[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm20[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm22[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm25[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm18[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm30, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm29, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm27, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $744, %rsp # imm = 0x2E8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i16_stride7_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512DQ-FAST-NEXT: subq $744, %rsp # imm = 0x2E8 ; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm4 -; AVX512DQ-FAST-NEXT: vporq %ymm3, %ymm4, %ymm19 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm2 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vporq %ymm1, %ymm2, %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm2 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,1,8,9,9,11] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,0,1,1,12,13,u,15> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm7 +; AVX512DQ-FAST-NEXT: vporq %ymm6, %ymm7, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm5 +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm12 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm1 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vporq %ymm1, %ymm3, %ymm19 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm4 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,0,0,1,8,9,9,11] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,0,1,1,12,13,u,15> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vprold $16, %ymm9, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm3 -; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpandn %ymm4, %ymm7, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm28 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,u,3,10,10,11,11> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm17, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vprold $16, %ymm14, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm5, %ymm9, %ymm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm28 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm12[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = <0,1,u,3,10,10,11,11> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm19, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm9 -; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpandn %ymm5, %ymm7, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm17 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15] +; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8,9,10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512DQ-FAST-NEXT: vprold $16, %ymm15, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15] +; AVX512DQ-FAST-NEXT: vprold $16, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm29 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm27, %zmm9, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX512DQ-FAST-NEXT: vprold $16, %xmm10, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm2[2],xmm8[3,4],xmm2[5],xmm8[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm29 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm7 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm29, %zmm9, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm1[2],xmm6[3,4],xmm1[5],xmm6[6,7] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm6 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vprold $16, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm6 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm12 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vprold $16, %xmm6, %xmm6 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm6[2],xmm1[3,4],xmm6[5],xmm1[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm9[1],xmm12[2,3],xmm9[4],xmm12[5,6],xmm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm1[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7,8],ymm5[9],ymm9[10,11],ymm5[12],ymm9[13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5,6,7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7,8,9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm9 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7,8,9],ymm7[10],ymm15[11,12],ymm7[13],ymm15[14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <6,u,u,u,7,u,u,7> -; AVX512DQ-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm15 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] -; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm27 = mem[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq $246, (%rsp), %ymm2 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm2 = mem[2,1,3,3] -; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm24[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm9 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm12 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm14[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7,8,9],ymm15[10],ymm2[11,12],ymm15[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7,8],ymm15[9],ymm14[10,11],ymm15[12],ymm14[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <6,u,u,u,7,u,u,7> +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %ymm27, %ymm14, %ymm27 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm16[0,0,1,3] +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm14 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm29 = mem[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq $234, (%rsp), %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,2,2,3] ; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm5 = mem[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm22[2,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm28 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm6, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm17 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17 -; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,1,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm18[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm26[0,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm20[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm16[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm4 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm28 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm8, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm24, %zmm19 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm19 +; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm26[0,0,1,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm17[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm29 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm2 -; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm1 -; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm21[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm20[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm22[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm25[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm18[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,2,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm30 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm30, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm2 +; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm1 +; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 ; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm29, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm2 +; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm3 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm11, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm27, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm4 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 384(%rax) -; AVX512DQ-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512DQ-FAST-NEXT: addq $744, %rsp # imm = 0x2E8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -6135,1449 +6161,1472 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride7_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1656, %rsp # imm = 0x678 +; SSE-NEXT: subq $1640, %rsp # imm = 0x668 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa 112(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rsi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdx), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdx), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdx), %xmm1 ; SSE-NEXT: movdqa 96(%rcx), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rcx), %xmm8 -; SSE-NEXT: movdqa 112(%r8), %xmm5 -; SSE-NEXT: movdqa 112(%r9), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rax), %xmm11 +; SSE-NEXT: movdqa 112(%rcx), %xmm6 +; SSE-NEXT: movdqa 112(%r8), %xmm4 +; SSE-NEXT: movdqa 112(%r9), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rax), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] ; SSE-NEXT: movdqa %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[0,2] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[0,2] ; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm1, %xmm2 -; SSE-NEXT: andnps %xmm11, %xmm1 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: orps %xmm2, %xmm1 +; SSE-NEXT: andps %xmm1, %xmm4 +; SSE-NEXT: andnps %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: orps %xmm4, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 96(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa 96(%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa 96(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] ; SSE-NEXT: movdqa 96(%rax), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: andps %xmm4, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm8[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm3[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: andps %xmm5, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: andps %xmm4, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: andps %xmm5, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rax), %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,1] -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%r8), %xmm9 -; SSE-NEXT: movdqa (%r9), %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE-NEXT: movdqa (%r8), %xmm8 +; SSE-NEXT: movdqa (%r9), %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rcx), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa (%rcx), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa (%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] -; SSE-NEXT: andps %xmm12, %xmm2 -; SSE-NEXT: orps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm15 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,2],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%r8), %xmm12 -; SSE-NEXT: movdqa 16(%r9), %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%rcx), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 16(%rdx), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm9 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm4, %xmm15 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm15, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] -; SSE-NEXT: andps %xmm8, %xmm2 -; SSE-NEXT: orps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm12, %xmm3 +; SSE-NEXT: orps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rax), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 32(%r8), %xmm1 -; SSE-NEXT: movdqa 32(%r9), %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rax), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 16(%r8), %xmm14 +; SSE-NEXT: movdqa 16(%r9), %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm13 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 32(%rcx), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 16(%rcx), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: movdqa 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa 16(%rdx), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rsi), %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] -; SSE-NEXT: andps %xmm13, %xmm2 -; SSE-NEXT: orps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm12, %xmm3 +; SSE-NEXT: orps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rax), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 48(%r8), %xmm0 -; SSE-NEXT: movdqa 48(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm7[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rax), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 32(%r8), %xmm14 +; SSE-NEXT: movdqa 32(%r9), %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm11 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 48(%rcx), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 48(%rdx), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa 48(%rsi), %xmm6 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm8 ; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 32(%rcx), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa 32(%rdx), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,2,2] +; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa 32(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm11 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] -; SSE-NEXT: andps %xmm13, %xmm2 -; SSE-NEXT: orps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rax), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 48(%r8), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%r9), %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 48(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: movdqa 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm5, %xmm15 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm15, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm2, %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rax), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rax), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa 64(%r8), %xmm1 -; SSE-NEXT: movdqa 64(%r9), %xmm3 +; SSE-NEXT: movdqa 64(%r9), %xmm2 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 64(%rcx), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 64(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm11 ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 64(%rdx), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm5 -; SSE-NEXT: movdqa 64(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm8 +; SSE-NEXT: movdqa 64(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2] -; SSE-NEXT: andps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm9, %xmm0 ; SSE-NEXT: orps %xmm1, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,3] +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rax), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 80(%rax), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,1] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa 80(%r8), %xmm1 -; SSE-NEXT: movdqa 80(%r9), %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa 80(%r9), %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 80(%rcx), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 80(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm13 ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 80(%rdx), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa 80(%rdi), %xmm5 -; SSE-NEXT: movdqa 80(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 80(%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa 80(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,2,2] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm5 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] -; SSE-NEXT: andps %xmm13, %xmm2 -; SSE-NEXT: orps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm9, %xmm0 +; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[3,3] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm0, %xmm1 -; SSE-NEXT: andnps %xmm6, %xmm0 -; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm8[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[0,3] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE-NEXT: psrld $16, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,0,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,1],xmm3[3,3] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pand %xmm12, %xmm14 -; SSE-NEXT: por %xmm14, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: andps %xmm4, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd (%rsp), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[3,3] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] +; SSE-NEXT: movaps %xmm13, %xmm3 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: orps %xmm4, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm15 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[0,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; SSE-NEXT: psrld $16, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,1,1] +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: andps %xmm5, %xmm4 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm1[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,65535,65535] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,5,6,6] ; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm7[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: andps %xmm12, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,1,3] +; SSE-NEXT: pand %xmm1, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,1,3] ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: por %xmm12, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm1, %xmm13 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[1,1] -; SSE-NEXT: movaps %xmm5, %xmm12 -; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm11[1,1] +; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm7[1,1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm8[1,1] +; SSE-NEXT: movaps %xmm8, %xmm12 +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm15[1,1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[1,1] +; SSE-NEXT: movaps %xmm14, %xmm11 +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[1,1] -; SSE-NEXT: movaps %xmm14, %xmm11 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm3[1,1] +; SSE-NEXT: movaps %xmm3, %xmm14 +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm6[1,1] -; SSE-NEXT: movaps %xmm6, %xmm14 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm10[1,1] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm8[1,1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm15[1,1] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm3[1,1] +; SSE-NEXT: andnps %xmm1, %xmm5 +; SSE-NEXT: orps %xmm0, %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[0],mem[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0,1],mem[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: andps %xmm2, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,6,7] +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: orps %xmm9, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: andps %xmm5, %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] +; SSE-NEXT: andnps %xmm1, %xmm5 +; SSE-NEXT: orps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps $42, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2,2],mem[2,0] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: orps %xmm5, %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: andnps %xmm0, %xmm5 +; SSE-NEXT: orps %xmm1, %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,0,65535] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movapd %xmm10, %xmm5 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm5 +; SSE-NEXT: orps %xmm1, %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm12 +; SSE-NEXT: orps %xmm1, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm11, %xmm6 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm11 +; SSE-NEXT: orps %xmm1, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[1,1] -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0],mem[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: andps %xmm9, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,6,7] -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; SSE-NEXT: andnps %xmm1, %xmm9 -; SSE-NEXT: orps %xmm7, %xmm9 -; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: andps %xmm3, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: orps %xmm9, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $42, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2,2],mem[2,0] -; SSE-NEXT: movaps {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: orps %xmm1, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: movaps %xmm12, %xmm6 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[2,1] -; SSE-NEXT: andps %xmm0, %xmm6 -; SSE-NEXT: orps %xmm1, %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movapd %xmm12, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm14, %xmm15 +; SSE-NEXT: movaps %xmm14, %xmm9 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm3[2,1] -; SSE-NEXT: andps %xmm0, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm9 ; SSE-NEXT: orps %xmm1, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm15, %xmm8 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[2,1] -; SSE-NEXT: andps %xmm0, %xmm8 -; SSE-NEXT: orps %xmm1, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm11, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm7 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,1] -; SSE-NEXT: andps %xmm0, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm7 ; SSE-NEXT: orps %xmm1, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm14, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movapd %xmm14, %xmm5 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,1] -; SSE-NEXT: andps %xmm0, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm5 ; SSE-NEXT: orps %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,1] -; SSE-NEXT: andps %xmm0, %xmm3 -; SSE-NEXT: orps %xmm6, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,1] -; SSE-NEXT: andps %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: andps %xmm2, %xmm4 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: andps %xmm2, %xmm10 -; SSE-NEXT: por %xmm10, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: andps %xmm2, %xmm4 -; SSE-NEXT: por %xmm4, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: andps %xmm2, %xmm4 -; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: movdqa %xmm4, %xmm10 ; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: andps %xmm2, %xmm4 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] +; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: andps %xmm2, %xmm4 -; SSE-NEXT: por %xmm4, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: andps %xmm4, %xmm13 +; SSE-NEXT: por %xmm13, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,0,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: andps %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: andps %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,0,1,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] -; SSE-NEXT: andps %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: andps %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm2, 672(%rax) +; SSE-NEXT: movdqa %xmm4, 672(%rax) ; SSE-NEXT: movdqa %xmm15, 560(%rax) -; SSE-NEXT: movdqa %xmm10, 448(%rax) -; SSE-NEXT: movdqa %xmm11, 336(%rax) -; SSE-NEXT: movdqa %xmm12, 224(%rax) -; SSE-NEXT: movdqa %xmm13, 112(%rax) -; SSE-NEXT: movdqa %xmm6, (%rax) -; SSE-NEXT: movdqa %xmm0, 736(%rax) -; SSE-NEXT: movaps %xmm3, 624(%rax) -; SSE-NEXT: movaps %xmm5, 512(%rax) -; SSE-NEXT: movaps %xmm7, 400(%rax) -; SSE-NEXT: movaps %xmm8, 288(%rax) -; SSE-NEXT: movaps %xmm9, 176(%rax) +; SSE-NEXT: movdqa %xmm13, 448(%rax) +; SSE-NEXT: movdqa %xmm3, 336(%rax) +; SSE-NEXT: movdqa %xmm6, 224(%rax) +; SSE-NEXT: movdqa %xmm8, 112(%rax) +; SSE-NEXT: movdqa %xmm10, (%rax) +; SSE-NEXT: movdqa %xmm2, 736(%rax) +; SSE-NEXT: movaps %xmm5, 624(%rax) +; SSE-NEXT: movaps %xmm7, 512(%rax) +; SSE-NEXT: movaps %xmm9, 400(%rax) +; SSE-NEXT: movaps %xmm11, 288(%rax) +; SSE-NEXT: movaps %xmm12, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 864(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 784(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 752(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7664,144 +7713,145 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 832(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 800(%rax) -; SSE-NEXT: addq $1656, %rsp # imm = 0x678 +; SSE-NEXT: addq $1640, %rsp # imm = 0x668 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1464, %rsp # imm = 0x5B8 +; AVX1-ONLY-NEXT: subq $1496, %rsp # imm = 0x5D8 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5],xmm3[6],xmm4[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0],xmm3[1],xmm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 112(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 112(%rax), %xmm4 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1,2,3,4,5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2],xmm8[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm8 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm8[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0],xmm2[1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 112(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 112(%rax), %xmm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0],xmm7[1,2,3,4,5,6],xmm11[7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm11[2],xmm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm12 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] ; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm13, %ymm11 ; AVX1-ONLY-NEXT: vandps %ymm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm15 ; AVX1-ONLY-NEXT: vorps %ymm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5],xmm12[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm2[2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm12 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm4[0],zero,xmm4[1],zero +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm3[0],zero,xmm3[1],zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm6[1],xmm8[1] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm11 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm14, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm14, %ymm8 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[2,2,3,3] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,5,6,6,7] +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,5,6,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1,2,3,4,5,6],xmm9[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2,3,4,5,6],xmm9[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm8[5],xmm5[6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1,2,3,4,5,6],xmm8[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm7[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,0,1,1] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm8[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 96(%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4],xmm4[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa 96(%r9), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa 96(%rax), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm4[3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm3[3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 @@ -7824,9 +7874,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7841,9 +7891,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm5 @@ -7859,29 +7909,28 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,2],xmm4[1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 @@ -7896,27 +7945,26 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm13[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[0,2],xmm3[1,3] -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[0,2],xmm3[1,3] +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] @@ -7934,8 +7982,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] @@ -7943,7 +7992,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] @@ -7956,264 +8005,268 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm0 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm4 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm11[5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm15, %ymm10 -; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm3, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm9, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm8[6],xmm2[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6],xmm7[7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[0,2],xmm4[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm11 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm15[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm8[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm15[0,2],xmm11[1,3] -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm7[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[0,2],xmm11[1,3] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm8[6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm7[6],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm11[3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm7 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm13, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm10 ; AVX1-ONLY-NEXT: vmovdqa 64(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vmovdqa 64(%rax), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5],xmm14[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm14[5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm1, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm13, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6],xmm8[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6],xmm9[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm1, %ymm8 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[0,2],xmm6[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm4, %ymm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vmovdqa 80(%r9), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%r8), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 80(%r8), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 80(%rax), %xmm0 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa 80(%rax), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2,3,4],xmm6[5],xmm15[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm10[0,2],xmm0[1,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[0,2],xmm2[1,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm15[6],xmm7[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7 @@ -8221,33 +8274,34 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm15[2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm15 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm1[3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm2[3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm7 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm8 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm1 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4] @@ -8255,44 +8309,43 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm14, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm14[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2,3,4],xmm13[5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,1,0,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,5],xmm12[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm13 ; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm3, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm13[6],xmm12[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6],xmm13[7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm12, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm12, %ymm6 -; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm14[0,2],xmm0[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm12, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm10, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm12 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm0[3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 @@ -8308,8 +8361,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm10, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8326,9 +8379,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm4, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm6 @@ -8357,11 +8410,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm6, %ymm13 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX1-ONLY-NEXT: vandps %ymm8, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm8, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm13, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8378,17 +8431,17 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm12, %ymm12 @@ -8404,28 +8457,27 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm5, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm10, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm10, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm5, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm12 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm3[1],xmm12[1] ; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload @@ -8446,14 +8498,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm11[0,0,0,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm14 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm14 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm8, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm8, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8470,11 +8522,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] @@ -8487,67 +8539,70 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm10, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm1 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm14, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm4[1],xmm11[1] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm11 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm3[1],xmm11[1] ; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = mem[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm4, %ymm11 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm3, %ymm11 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,0,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm12[6,7] ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm6[2],xmm12[2],xmm6[3],xmm12[3] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm8, %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm8, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm9[1],xmm4[1] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm0[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8664,13 +8719,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm0, 848(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 832(%rax) -; AVX1-ONLY-NEXT: addq $1464, %rsp # imm = 0x5B8 +; AVX1-ONLY-NEXT: addq $1496, %rsp # imm = 0x5D8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1720, %rsp # imm = 0x6B8 +; AVX2-SLOW-NEXT: subq $1688, %rsp # imm = 0x698 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8680,120 +8735,121 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm9 ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rax), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm11 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <3,u,u,3,u,u,u,4> -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm10, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <3,u,u,3,u,u,u,4> +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm11, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm10, %ymm3 +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm11, %ymm3 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vpermd %ymm9, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpermd %ymm9, %ymm10, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vpermd %ymm8, %ymm11, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm11, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,3,2,3,4,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <3,u,u,u,4,u,u,4> -; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm10, %ymm0 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm8, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <3,u,u,u,4,u,u,4> +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm7, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm9, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm1 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] @@ -8819,16 +8875,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -8850,13 +8905,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -8881,255 +8936,259 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] ; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm8 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm7 ; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm12 -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm12, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm15, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm15, %ymm14, %ymm6 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm8, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm6, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufd $165, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2],xmm6[3,4],xmm1[5],xmm6[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3],xmm1[4],xmm6[5,6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm14[0,1],xmm8[2],xmm14[3,4],xmm8[5],xmm14[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm8, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm8 -; AVX2-SLOW-NEXT: vpshufd $165, (%rsp), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm14[0],xmm8[1],xmm14[2,3],xmm8[4],xmm14[5,6],xmm8[7] -; AVX2-SLOW-NEXT: vpshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2],xmm14[3,4],xmm6[5],xmm14[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3],xmm1[4],xmm14[5,6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[1,1,2,3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm8, %ymm14, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-SLOW-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6],xmm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3],xmm4[4],xmm14[5,6],xmm4[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm9[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm15 = mem[1,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[1,1,2,3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm14, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,7,6] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm14, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[0,1,2,3,4,5,7,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm7 +; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm7, %ymm4 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,6] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vpshufhw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = mem[0,1,2,3,4,5,7,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm3, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm8, %ymm3 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] @@ -9153,94 +9212,118 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm9 ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm8, %ymm9, %ymm6 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,3,2,3,4,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpshuflw $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] +; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpshuflw $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,3,u,u,u,4> -; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm6, %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] +; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,0,2,1,4,4,6,5] @@ -9249,210 +9332,189 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] +; AVX2-SLOW-NEXT: vpshufhw $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm5, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm9 +; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] +; AVX2-SLOW-NEXT: vpshufhw $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm15[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8,9,10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm15[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm11[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7,8,9],ymm9[10],ymm11[11,12],ymm9[13],ymm11[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm6[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm15 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm11, %ymm12, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm9, %ymm6 ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm7, %ymm10, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm7, %ymm8 -; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15] -; AVX2-SLOW-NEXT: vpshufhw $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15] +; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6,7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm9, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm8, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15] -; AVX2-SLOW-NEXT: vpshufhw $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm8, %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8,9,10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15] +; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm10, %ymm9 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm9[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm9, %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7,8,9,10],ymm11[11],ymm10[12,13],ymm11[14],ymm10[15] +; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm5[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm10, %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,3,6,6,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-SLOW-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,3,3,6,7,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm4[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,3,6,6,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm12, %ymm14, %ymm12 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-SLOW-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,3,3,6,7,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm14, %ymm15, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] -; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7,8],ymm13[9],ymm12[10,11],ymm13[12],ymm12[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm11, %ymm12, %ymm11 -; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8,9,10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] -; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm12, %ymm13, %ymm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm0[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7,8,9,10],ymm14[11],ymm13[12,13],ymm14[14],ymm13[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7,8],ymm15[9],ymm14[10,11],ymm15[12],ymm14[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm12, %ymm14, %ymm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,2,3,6,6,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,3,3,6,7,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3,4],ymm15[5],ymm1[6,7,8,9],ymm15[10],ymm1[11,12],ymm15[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,3,3,6,7,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7,8,9],ymm15[10],ymm2[11,12],ymm15[13],ymm2[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[2,3,3,3,6,7,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm15, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm11, %ymm14, %ymm11 -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm12, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm13, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm14, %ymm15, %ymm13 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm8, %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm12, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 544(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 320(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 640(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, 608(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 544(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 640(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 608(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 576(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 416(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 416(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm6, 384(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, 192(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9489,23 +9551,23 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 832(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 800(%rax) -; AVX2-SLOW-NEXT: addq $1720, %rsp # imm = 0x6B8 +; AVX2-SLOW-NEXT: addq $1688, %rsp # imm = 0x698 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride7_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1240, %rsp # imm = 0x4D8 +; AVX2-FAST-NEXT: subq $1256, %rsp # imm = 0x4E8 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <3,u,u,u,4,u,u,4> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] @@ -9513,13 +9575,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm1 @@ -9535,305 +9597,314 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm8 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm13 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa 64(%rax), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm11 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm12 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm15 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm12 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX2-FAST-NEXT: vmovdqa 64(%rax), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm2[2],ymm8[3,4],ymm2[5],ymm8[6,7,8,9],ymm2[10],ymm8[11,12],ymm2[13],ymm8[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm13 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7,8,9],ymm3[10],ymm9[11,12],ymm3[13],ymm9[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7,8,9],ymm9[10],ymm1[11,12],ymm9[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[0,1,1,3,4,5,5,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7,8,9],ymm0[10],ymm14[11,12],ymm0[13],ymm14[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,2,2,3,5,6,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,2,2,3,5,6,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,2,2,3,5,6,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[1,2,2,3,5,6,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7,8,9],ymm3[10],ymm14[11,12],ymm3[13],ymm14[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm10, %ymm14 -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm14 +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm10[1,2,2,3,5,6,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm11[1,2,2,3,5,6,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7,8,9],ymm0[10],ymm14[11,12],ymm0[13],ymm14[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm13, %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm14, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm11 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7,8,9],ymm14[10],ymm11[11,12],ymm14[13],ymm11[14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm14[2],ymm4[3,4],ymm14[5],ymm4[6,7,8,9],ymm14[10],ymm4[11,12],ymm14[13],ymm4[14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm11, %ymm14, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm14, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm11, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm11[2],ymm1[3,4],ymm11[5],ymm1[6,7,8,9],ymm11[10],ymm1[11,12],ymm11[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6,7,8],ymm11[9],ymm15[10,11],ymm11[12],ymm15[13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm3[1],ymm11[2,3],ymm3[4],ymm11[5,6,7,8],ymm3[9],ymm11[10,11],ymm3[12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,3,3,7,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7,8,9],ymm14[10],ymm3[11,12],ymm14[13],ymm3[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,3,3,7,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3,4],ymm14[5],ymm0[6,7,8,9],ymm14[10],ymm0[11,12],ymm14[13],ymm0[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm11 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm5, %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7,8,9],ymm14[10],ymm2[11,12],ymm14[13],ymm2[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0,1,2],ymm4[3],ymm14[4,5],ymm4[6],ymm14[7,8,9,10],ymm4[11],ymm14[12,13],ymm4[14],ymm14[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7,8,9,10],ymm11[11],ymm14[12,13],ymm11[14],ymm14[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm14 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm14 +; AVX2-FAST-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm11, %ymm14, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm11, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm4, %ymm14, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3],ymm11[4,5],ymm1[6],ymm11[7,8,9,10],ymm1[11],ymm11[12,13],ymm1[14],ymm11[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm11 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6,7,8],ymm11[9],ymm15[10,11],ymm11[12],ymm15[13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8,9,10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm3[1],ymm11[2,3],ymm3[4],ymm11[5,6,7,8],ymm3[9],ymm11[10,11],ymm3[12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <3,u,u,u,4,u,u,4> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <3,u,u,u,4,u,u,4> +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm7 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm6, %ymm7, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <3,u,u,3,u,u,u,4> @@ -9851,15 +9922,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,1,1,3,4,5,5,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] @@ -9882,9 +9953,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] @@ -9904,15 +9975,16 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpbroadcastd 124(%r8), %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -9926,317 +9998,315 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm15 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,1] ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm15 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm11 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm9 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm14 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] -; AVX2-FAST-NEXT: vmovdqa %xmm6, %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm10 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm13 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm9 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm8 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm5 -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vpbroadcastd 32(%rax), %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd 32(%rax), %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd 64(%rax), %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd 96(%rax), %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vpbroadcastd 64(%rax), %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm11 +; AVX2-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-NEXT: vpbroadcastd 96(%rax), %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm10, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm7, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm0 +; AVX2-FAST-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6],xmm3[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm6 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm7 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vpshufd $165, (%rsp), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3],xmm0[4],xmm6[5,6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2],xmm6[3,4],xmm2[5],xmm6[6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 68(%rax), %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm0 +; AVX2-FAST-NEXT: vpshufd $165, (%rsp), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 100(%rax), %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm13, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm3, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm12, %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm2[2],xmm14[3,4],xmm2[5],xmm14[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6],xmm0[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3,4],xmm14[5],xmm11[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm11 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm1[1],xmm11[2,3],xmm1[4],xmm11[5,6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1],xmm6[2],xmm11[3,4],xmm6[5],xmm11[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm11, %ymm7 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm11 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm11, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 68(%rax), %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 100(%rax), %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm14 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 40(%rax), %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm9 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 72(%rax), %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] +; AVX2-FAST-NEXT: vpbroadcastd 40(%rax), %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm9, %xmm10 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 104(%rax), %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm8, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd 72(%rax), %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX2-FAST-NEXT: vpbroadcastd 104(%rax), %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm9, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm8, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm4, %ymm4 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 544(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 640(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 608(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 576(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 416(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 384(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 352(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 160(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 768(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 736(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm12, 704(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 672(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 544(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 320(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 640(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 608(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 576(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 416(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 384(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 352(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 160(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 768(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 736(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm14, 704(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 672(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm3, 512(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm13, 480(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 448(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 288(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 256(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm12, 480(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 448(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 288(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 256(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10247,7 +10317,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 832(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 800(%rax) -; AVX2-FAST-NEXT: addq $1240, %rsp # imm = 0x4D8 +; AVX2-FAST-NEXT: addq $1256, %rsp # imm = 0x4E8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -10256,189 +10326,188 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: subq $1544, %rsp # imm = 0x608 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <3,u,u,u,4,u,u,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <3,u,u,u,4,u,u,4> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm2, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm8, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm5, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <3,u,u,3,u,u,u,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm9, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm7, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm7, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm14, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm12, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm14, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm7, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm8, %ymm14, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm12, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm10, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm14, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm8, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm14, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm10, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm14, %ymm15, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm10, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm8, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm3, %ymm8, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm7, %ymm14, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm12, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm6, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm12, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm6, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm12, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm12, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm9, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm12, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm7, %ymm11, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm9, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rax), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rax), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm8[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7,8],ymm1[9],ymm7[10,11],ymm1[12],ymm7[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -10451,7 +10520,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,3,3,3,6,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -10463,243 +10532,241 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rax), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rax), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 64(%rax), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm13, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 96(%rax), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm15, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm15, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm9, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm7, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1],xmm9[2,3],xmm4[4],xmm9[5,6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1],xmm9[2],xmm15[3,4],xmm9[5],xmm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm9, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0],xmm9[1],xmm15[2,3],xmm9[4],xmm15[5,6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3],xmm0[4],xmm15[5,6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3,4],xmm15[5],xmm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm9, %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm13 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3],xmm0[4],xmm13[5,6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm13, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1],xmm3[2],xmm13[3,4],xmm3[5],xmm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm5, %ymm8, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm1[1],xmm13[2,3],xmm1[4],xmm13[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0,1],xmm5[2],xmm13[3,4],xmm5[5],xmm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm9, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%rax), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 68(%rax), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm9, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 68(%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm3, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rax), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm4, %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rax), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm8, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] @@ -10723,17 +10790,17 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 104(%rax), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm8, %ymm9, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] @@ -10762,45 +10829,46 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm5, %ymm6, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm6, %ymm7, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm14, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm5, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm5[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8,9,10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm15, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm3, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7,8,9,10],ymm11[11],ymm7[12,13],ymm11[14],ymm7[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload @@ -10808,12 +10876,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm11, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm15, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] ; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] @@ -10821,61 +10889,62 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm7[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm9, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm9, %ymm10, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm14, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm6, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm14, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7,8,9],ymm10[10],ymm14[11,12],ymm10[13],ymm14[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -10883,74 +10952,74 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm14, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8,9,10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8,9,10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6,7,8],ymm12[9],ymm13[10,11],ymm12[12],ymm13[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm12, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8,9,10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm12, %ymm13, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm6, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,3,3,6,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm5, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7,8,9],ymm1[10],ymm15[11,12],ymm1[13],ymm15[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[2,3,3,3,6,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7,8,9],ymm15[10],ymm13[11,12],ymm15[13],ymm13[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[2,3,3,3,6,7,7,7] @@ -11018,570 +11087,562 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-SLOW-LABEL: store_i16_stride7_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $2440, %rsp # imm = 0x988 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm19 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm15, %ymm0 +; AVX512F-SLOW-NEXT: subq $2456, %rsp # imm = 0x998 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm12 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm3 +; AVX512F-SLOW-NEXT: vporq %ymm2, %ymm3, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm2 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm1 -; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm16 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm6, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm5 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm5, %ymm2 -; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm4, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm0 -; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm3 +; AVX512F-SLOW-NEXT: vporq %ymm2, %ymm3, %ymm20 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa %ymm5, %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm5, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm26 +; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm10 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm10, %ymm3 +; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm9 +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm9, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm6, %ymm3 +; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm30 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm5, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm28 +; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm14, %ymm9 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm9, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm9, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm12, %ymm13 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm13, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm13, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm11 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vprold $16, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7,8,9],ymm0[10],ymm11[11,12],ymm0[13],ymm11[14,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm13[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,3,2,10,10,10,11] -; AVX512F-SLOW-NEXT: vpermi2q %zmm8, %zmm0, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512F-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm10 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm17 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm24[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm24[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7,8,9,10],ymm0[11],ymm8[12,13],ymm0[14],ymm8[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %ymm0 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm0[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX512F-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm24[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm15[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7,8,9],ymm7[10],ymm11[11,12],ymm7[13],ymm11[14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512F-SLOW-NEXT: vprold $16, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm10 -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm19, %zmm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm16, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm8 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[14,15],zero,zero,ymm8[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[16,17],zero,zero,ymm8[u,u],zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm19, %ymm7, %ymm8 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm0 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm15[2,1,3,3] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm11[2,2,2,2] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm11, %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm14, %ymm16 +; AVX512F-SLOW-NEXT: vpor %ymm15, %ymm13, %ymm13 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vprold $16, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm11[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,1,3,2,10,10,10,11] +; AVX512F-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512F-SLOW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm12, %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm14, %ymm27 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm4[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7,8,9,10],ymm13[11],ymm14[12,13],ymm13[14],ymm14[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,3,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %ymm13 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[3,3,3,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[2,1,2,3,6,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7,8,9],ymm12[10],ymm7[11,12],ymm12[13],ymm7[14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7,8,9],ymm8[10],ymm4[11,12],ymm8[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512F-SLOW-NEXT: vprold $16, %ymm13, %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm12 +; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm12 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm19, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm20, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15],zero,zero,ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17],zero,zero,ymm7[u,u],zero,zero +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm14, %ymm4, %ymm7 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm12 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm4 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm13[2,1,3,3] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm7 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm8[2,2,2,2] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm12[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %ymm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,1,2,2,4,5,6,6] ; AVX512F-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[2,3,3,3,6,7,7,7] ; AVX512F-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm4 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX512F-SLOW-NEXT: vpandnq %ymm7, %ymm19, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpbroadcastd 72(%rax), %ymm0 -; AVX512F-SLOW-NEXT: vpandn %ymm0, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %ymm7 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %xmm15 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm11 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm11 -; AVX512F-SLOW-NEXT: vpandnq %ymm11, %ymm10, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm11 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm30 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm20, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm26 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm11 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm11[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] -; AVX512F-SLOW-NEXT: vpandnq %ymm10, %ymm19, %ymm10 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm27 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512F-SLOW-NEXT: vpandn %ymm7, %ymm14, %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpbroadcastd 72(%rax), %ymm4 +; AVX512F-SLOW-NEXT: vpandnq %ymm4, %ymm25, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %ymm4 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm12 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %xmm12 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm19 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm7 +; AVX512F-SLOW-NEXT: vpandnq %ymm7, %ymm25, %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm15 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm13 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm2[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7,8,9],ymm15[10],ymm13[11,12],ymm15[13],ymm13[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm15 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm15[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3] +; AVX512F-SLOW-NEXT: vpandn %ymm13, %ymm14, %ymm13 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm15, %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm15, %ymm29 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm13[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7,8,9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm10 -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm14[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7,8,9],ymm8[10],ymm1[11,12],ymm8[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm11[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7,8,9,10],ymm8[11],ymm0[12,13],ymm8[14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm11 +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm5, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7,8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm14, %ymm28 -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm8 -; AVX512F-SLOW-NEXT: vprold $16, %ymm31, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm9 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm18[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3],ymm2[4,5],ymm8[6],ymm2[7,8,9,10],ymm8[11],ymm2[12,13],ymm8[14],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,2,3,3,10,9,11,10] -; AVX512F-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm13 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm13, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512F-SLOW-NEXT: vprold $16, %ymm18, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm26[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,3,3,10,9,11,10] +; AVX512F-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,1,2,2,4,5,6,6] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm11, %zmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm27, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] -; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm13, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] +; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,3,3,6,7,7,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX512F-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm4 +; AVX512F-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm18, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vprold $16, %xmm1, %xmm9 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm23 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm7 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm9 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0],xmm9[1],xmm10[2,3],xmm9[4],xmm10[5,6],xmm9[7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vprold $16, %xmm4, %xmm8 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm3 -; AVX512F-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm6 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa %xmm15, %xmm10 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm1 +; AVX512F-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm2 +; AVX512F-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm19 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-SLOW-NEXT: vprold $16, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm17 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-SLOW-NEXT: vprold $16, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm1[1],xmm6[2,3],xmm1[4],xmm6[5,6],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 -; AVX512F-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm1 -; AVX512F-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; AVX512F-SLOW-NEXT: vprold $16, %xmm7, %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3,4],xmm7[5],xmm0[6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm20 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3],xmm0[4],xmm7[5,6],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm6, %zmm0 -; AVX512F-SLOW-NEXT: vpbroadcastd (%rax), %ymm6 -; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm1 +; AVX512F-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm2 +; AVX512F-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm10 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-SLOW-NEXT: vprold $16, %ymm4, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8,9,10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,0,2,1] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-SLOW-NEXT: vprold $16, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3],xmm1[4],xmm6[5,6],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm10 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm3, %zmm1 +; AVX512F-SLOW-NEXT: vpbroadcastd (%rax), %ymm3 +; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm30[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm30, %ymm12 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm12[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm16 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-SLOW-NEXT: vprold $16, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5],ymm9[6],ymm3[7,8,9,10],ymm9[11],ymm3[12,13],ymm9[14],ymm3[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[0,1,2,2,4,5,6,6] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm11, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm27, %zmm28 ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm11, %zmm13, %zmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[2,3,3,3,6,7,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX512F-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm30 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm5, %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512F-SLOW-NEXT: vprold $16, %xmm5, %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm7[2],xmm13[3,4],xmm7[5],xmm13[6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] +; AVX512F-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm30 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm18, %zmm30 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512F-SLOW-NEXT: vprold $16, %xmm7, %xmm9 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1],xmm9[2],xmm12[3,4],xmm9[5],xmm12[6,7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 ; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm8 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm13[0,0,1,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm2[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm5[0,0,2,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm14 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm7[0,0,1,3] -; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm22 = mem[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[2,2,2,3] -; AVX512F-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[2,1,3,3] -; AVX512F-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm6 -; AVX512F-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm6, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm13 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm1[1],xmm15[2,3],xmm1[4],xmm15[5,6],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm27[0,1,2,2,4,5,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm27[2,3,3,3,6,7,7,7] -; AVX512F-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[2,2,2,3] -; AVX512F-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm31 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm27 = mem[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm7 = mem[0,0,2,1] -; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm14 = mem[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm4[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,0,2,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,0,1,1] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm15 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm4[0,0,1,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,0,1,1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm6[0,0,2,1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm6[0,0,1,3] +; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm27 = mem[2,1,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[2,2,2,3] +; AVX512F-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm31[2,1,3,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm2, %zmm12 +; AVX512F-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm3 +; AVX512F-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm31 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm31 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm14 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0],xmm12[1],xmm15[2,3],xmm12[4],xmm15[5,6],xmm12[7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm3[4],xmm14[5],xmm3[5],xmm14[6],xmm3[6],xmm14[7],xmm3[7] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm18 = ymm29[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm17 = ymm29[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[2,2,2,3] +; AVX512F-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm7 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermpd $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[2,1,3,2] +; AVX512F-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm29 = mem[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm1[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,0,2,1] +; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm8 = mem[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm9 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm0[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm11 = mem[0,0,2,1] +; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm12 = mem[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm25[0,0,1,1] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[2,1,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm23[2,2,2,3] ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm24, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm1 = mem[2,1,3,2] ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 @@ -11592,32 +11653,32 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] ; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm29[4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm4[0,1,2,3],zmm23[4,5,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm29, %zmm19 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm25, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm19 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm20 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm10, %zmm2, %zmm20 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm25, %zmm10 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm2, %zmm10 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm2, %zmm26 +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm26 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm23 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm10 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm22[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm10 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm22 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm16[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm27 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] ; AVX512F-SLOW-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm1, %ymm0 @@ -11627,811 +11688,797 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm1 = mem[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm22 = mem[2,1,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm15[2,1,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm0 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm14[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm8, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm4 +; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[2,1,3,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm16 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm18[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm17[2,1,3,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm17 +; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm17 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm0 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm8, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm4, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm4, %zmm26 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm28, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm21, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm18, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm22 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm3, %zmm27 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm24, %zmm4 -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm5 = mem[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm21 = mem[0,0,2,1] -; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm7 = mem[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm2, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm23 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm21, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm19, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm19 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm28 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm20, %zmm1 +; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw $180, (%rsp), %xmm2 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm2 = mem[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[0,0,2,1] +; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm6 = mem[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] ; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm8 = mem[0,0,1,1] ; AVX512F-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm14 = mem[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm15 = mem[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm16 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm17 = mem[0,0,2,1] -; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm11 = mem[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,3] +; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm11 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm12 = mem[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm14 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm15 = mem[0,0,2,1] +; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm5 = mem[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,3] ; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm18 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm6 = mem[0,2,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm12, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm12 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm3, %zmm14 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm3, %zmm30 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm21, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm5 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm17, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm3, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm3, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm7 = mem[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm20, %zmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm20, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm12 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm0, %zmm30 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm25, %zmm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm15, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm25, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm31 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm27 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm17 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, 256(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 576(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 768(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 832(%rax) -; AVX512F-SLOW-NEXT: addq $2440, %rsp # imm = 0x988 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 704(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 640(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 576(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 512(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 768(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 832(%rax) +; AVX512F-SLOW-NEXT: addq $2456, %rsp # imm = 0x998 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: subq $2264, %rsp # imm = 0x8D8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm2, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm5, %ymm6, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm5, %ymm6, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm6, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm5, %ymm10, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm2, %ymm3, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm2, %ymm3, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm2, %ymm3, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm2, %ymm3, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm15, %ymm0, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm4, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7,8,9],ymm15[10],ymm11[11,12],ymm15[13],ymm11[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7,8,9,10],ymm0[11],ymm11[12,13],ymm0[14],ymm11[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7,8,9],ymm11[10],ymm0[11,12],ymm11[13],ymm0[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm11, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm12, %ymm11, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm1, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm12, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8,9,10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm17, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <5,u,u,u,6,u,u,6> -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm16, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm17, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <5,u,u,u,6,u,u,6> +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[14,15],zero,zero,ymm10[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17],zero,zero,ymm10[u,u],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm9, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero,ymm11[u,u],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm16, %ymm8, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm1, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm0, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm11[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm23, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,1,4,5,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm8, %ymm16, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 72(%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rax), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,1,1,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm26, %zmm0, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm26, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rax), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm26, %zmm8, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,1,4,5,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm8, %ymm24, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm9, %ymm23, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 72(%rax), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm8, %ymm9, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rax), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,0,1,1,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm27, %zmm13, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm10, %ymm17, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm17, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm27, %zmm13, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7,8,9],ymm13[10],ymm10[11,12],ymm13[13],ymm10[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7,8,9],ymm13[10],ymm10[11,12],ymm13[13],ymm10[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,u,3,10,10,11,11> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm22, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7,8,9,10],ymm10[11],ymm0[12,13],ymm10[14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,u,3,10,10,11,11> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm22, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm2, %ymm16, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm24, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm10, %ymm23, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,2,2,2,6,6,6,6] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm16 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15] ; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm0, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm0, %zmm16, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,1,3,4,5,5,7] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm4, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm4, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm3, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[1,2,2,3,5,6,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [2,2,3,3,10,9,11,10] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,2,3,3,10,9,11,10] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 96(%rax), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm3, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1],xmm9[2],xmm12[3,4],xmm9[5],xmm12[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm12, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2,3],xmm9[4],xmm12[5,6],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,1,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 100(%rax), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 104(%rax), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [2,2,2,3,8,8,8,9] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,3,3,7,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 96(%rax), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm31 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm2, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm24 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm22 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3],xmm3[4],xmm7[5,6],xmm3[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm7, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,1,8,9,9,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 64(%rax), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 68(%rax), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm4, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 100(%rax), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 104(%rax), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm15, %xmm30 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm13, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm13, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm14, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,1,8,9,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 64(%rax), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 68(%rax), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm28 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] ; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm11, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5,6,7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm5, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm8, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm2[2],ymm6[3,4],ymm2[5],ymm6[6,7,8,9],ymm2[10],ymm6[11,12],ymm2[13],ymm6[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7,8,9,10],ymm7[11],ymm6[12,13],ymm7[14],ymm6[15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4],ymm7[5],ymm3[6,7,8,9],ymm7[10],ymm3[11,12],ymm7[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm10, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm2[2],ymm6[3,4],ymm2[5],ymm6[6,7,8,9],ymm2[10],ymm6[11,12],ymm2[13],ymm6[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm16, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm2, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm8, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,2,3,8,8,8,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7,8,9,10],ymm11[11],ymm13[12,13],ymm11[14],ymm13[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7,8,9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,3,3,7,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm29, %ymm0, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm2, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm7[2],xmm12[3,4],xmm7[5],xmm12[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm11, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm23 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm2[1],xmm15[2,3],xmm2[4],xmm15[5,6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[2,1,3,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[2,1,3,3] -; AVX512F-ONLY-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm31 = mem[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm30 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm16[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm11[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm17[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm8[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm29[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm9, %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3],xmm0[4],xmm9[5,6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm20 = mem[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm0[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm6[3,3,3,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm6[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm31 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm30 = mem[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm29 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm16[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm10[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm12[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm14[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm28[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm26[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm25[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm23[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm21[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm8, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm7, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm10, %zmm8, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm20, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm10, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm5, %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm11 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm14[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm22 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm24, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm22, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm20, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm31, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm28, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm27, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm17, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm16, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm30, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm25, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm25 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm0, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[2,1,3,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm26 = mem[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm26 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm11 = mem[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,1,3] ; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[2,1,3,3] ; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload @@ -12442,44 +12489,45 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: # ymm18 = mem[0,0,1,3] ; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm13 = mem[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm10, %zmm1, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm19, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm3 = mem[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm0, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm19, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 640(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512F-ONLY-FAST-NEXT: addq $2264, %rsp # imm = 0x8D8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq @@ -12487,684 +12535,669 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-LABEL: store_i16_stride7_vf64: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: subq $2264, %rsp # imm = 0x8D8 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm2 -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm2, %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512DQ-FAST-NEXT: vporq %ymm5, %ymm6, %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm6 -; AVX512DQ-FAST-NEXT: vporq %ymm5, %ymm6, %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm6 -; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm6, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm10 -; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm10 -; AVX512DQ-FAST-NEXT: vporq %ymm5, %ymm10, %ymm19 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm10 -; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm10 -; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm3 -; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm3, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm3 +; AVX512DQ-FAST-NEXT: vporq %ymm2, %ymm3, %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm3 +; AVX512DQ-FAST-NEXT: vporq %ymm2, %ymm3, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vporq %ymm2, %ymm3, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vporq %ymm2, %ymm3, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm5 -; AVX512DQ-FAST-NEXT: vporq %ymm15, %ymm0, %ymm22 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] -; AVX512DQ-FAST-NEXT: vprold $16, %ymm4, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7,8,9],ymm15[10],ymm11[11,12],ymm15[13],ymm11[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7,8,9,10],ymm0[11],ymm11[12,13],ymm0[14],ymm11[15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7,8,9],ymm11[10],ymm0[11,12],ymm11[13],ymm0[14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm11 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm11, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm12 +; AVX512DQ-FAST-NEXT: vporq %ymm12, %ymm11, %ymm22 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] +; AVX512DQ-FAST-NEXT: vprold $16, %ymm1, %ymm12 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm12, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm18 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8,9,10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm17, %zmm8 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <5,u,u,u,6,u,u,6> -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm11 -; AVX512DQ-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm16, %zmm8 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm17, %zmm9 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <5,u,u,u,6,u,u,6> +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm10 +; AVX512DQ-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512DQ-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[14,15],zero,zero,ymm10[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17],zero,zero,ymm10[u,u],zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm23, %ymm9, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512DQ-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero,ymm11[u,u],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm16, %ymm8, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm15 -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm15 -; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm0 -; AVX512DQ-FAST-NEXT: vprold $16, %ymm1, %ymm8 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm11 +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm11 +; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm8 +; AVX512DQ-FAST-NEXT: vprold $16, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm11[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm23 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,1,4,5,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm8 -; AVX512DQ-FAST-NEXT: vpandnq %ymm8, %ymm16, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpbroadcastd 72(%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rax), %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm11 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm27 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm11 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm28 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,1,1,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm26, %zmm0, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm15 -; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm26, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm26 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm9 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rax), %ymm8 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm26, %zmm8, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,1,4,5,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm8, %ymm24, %ymm9 +; AVX512DQ-FAST-NEXT: vpandnq %ymm9, %ymm23, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpbroadcastd 72(%rax), %ymm8 +; AVX512DQ-FAST-NEXT: vpandn %ymm8, %ymm9, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rax), %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm16 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm8 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,0,1,1,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm27, %zmm13, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm10 +; AVX512DQ-FAST-NEXT: vpandnq %ymm10, %ymm17, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm29 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm17, %zmm10 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm13 +; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm27, %zmm13, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7,8,9],ymm13[10],ymm10[11,12],ymm13[13],ymm10[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm10 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7,8,9],ymm13[10],ymm10[11,12],ymm13[13],ymm10[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,u,3,10,10,11,11> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7,8,9,10],ymm10[11],ymm0[12,13],ymm10[14],ymm0[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,u,3,10,10,11,11> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm22, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm2 -; AVX512DQ-FAST-NEXT: vpandnq %ymm2, %ymm16, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm24, %ymm10 +; AVX512DQ-FAST-NEXT: vpandnq %ymm10, %ymm23, %ymm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm12 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm10 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,2,2,2,6,6,6,6] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm16 ; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm0, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm0, %zmm16, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,1,3,4,5,5,7] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm7 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm13 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vprold $16, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vprold $16, %ymm3, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[1,2,2,3,5,6,6,7] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm1 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [2,2,3,3,10,9,11,10] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm13 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm13, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpbroadcastd 96(%rax), %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX512DQ-FAST-NEXT: vprold $16, %xmm3, %xmm9 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1],xmm9[2],xmm12[3,4],xmm9[5],xmm12[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm23 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm4 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm12, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2,3],xmm9[4],xmm12[5,6],xmm9[7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,1,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vpbroadcastd 100(%rax), %ymm3 -; AVX512DQ-FAST-NEXT: vpbroadcastd 104(%rax), %ymm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm5 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,3,3,7,7,6,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpbroadcastd 96(%rax), %ymm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm31 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm8, %xmm9 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm2, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm24 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm22 -; AVX512DQ-FAST-NEXT: vprold $16, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm6 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3],xmm3[4],xmm7[5,6],xmm3[7] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm7, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,1,8,9,9,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 -; AVX512DQ-FAST-NEXT: vpbroadcastd 64(%rax), %ymm5 -; AVX512DQ-FAST-NEXT: vpbroadcastd 68(%rax), %ymm7 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vpbroadcastd 100(%rax), %ymm2 +; AVX512DQ-FAST-NEXT: vpbroadcastd 104(%rax), %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm15, %xmm30 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm13, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm13, %xmm19 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vprold $16, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, %xmm9 +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm14, %xmm11 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,1,8,9,9,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vpbroadcastd 64(%rax), %ymm2 +; AVX512DQ-FAST-NEXT: vpbroadcastd 68(%rax), %ymm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm6 ; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm28 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm18 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] ; AVX512DQ-FAST-NEXT: vprold $16, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm3 -; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm6 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm28 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm22 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm11, %xmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm5 ; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5,6,7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vprold $16, %ymm5, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vprold $16, %ymm8, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm2[2],ymm6[3,4],ymm2[5],ymm6[6,7,8,9],ymm2[10],ymm6[11,12],ymm2[13],ymm6[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7,8,9,10],ymm7[11],ymm6[12,13],ymm7[14],ymm6[15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4],ymm7[5],ymm3[6,7,8,9],ymm7[10],ymm3[11,12],ymm7[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm2[2],ymm6[3,4],ymm2[5],ymm6[6,7,8,9],ymm2[10],ymm6[11,12],ymm2[13],ymm6[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm16, %zmm2 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm2, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm12 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,2,3,8,8,8,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7,8,9,10],ymm11[11],ymm13[12,13],ymm11[14],ymm13[15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm9 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7,8,9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,3,3,7,7,6,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm30, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpermd %ymm29, %ymm0, %ymm3 ; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm11 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm21 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512DQ-FAST-NEXT: vprold $16, %xmm2, %xmm7 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm7[2],xmm12[3,4],xmm7[5],xmm12[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm27 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm11, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm8 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm12 -; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm23 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm23 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm23 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm2[1],xmm15[2,3],xmm2[4],xmm15[5,6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = mem[2,1,3,3] -; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm2 = mem[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,2,2,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm4 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm4 = mem[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = mem[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,2,2,3] -; AVX512DQ-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[2,1,3,3] -; AVX512DQ-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,2,2,3] -; AVX512DQ-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,2,2,3] -; AVX512DQ-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm31 = mem[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm30 = mem[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm16[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm11[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm17[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm8[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm29[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm14 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm9, %zmm29, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm24 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm24 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm13 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3],xmm0[4],xmm9[5,6],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm20 = mem[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm7 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm22 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm0[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm6[3,3,3,3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm6[2,2,2,2] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm15 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,0,1,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] +; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm6 = mem[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm9 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = mem[2,1,3,3] +; AVX512DQ-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm31 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm30 = mem[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm29 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm16[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm10[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm12[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm14[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm28[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm26[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm25[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm23[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm21[2,2,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm8, %zmm10 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm7, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm10, %zmm8, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm20, %ymm7 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm10, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm7 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm11 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm3, %ymm1 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm10 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm14[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm21 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm22 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm14 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm24, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm22, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm20, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm31, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm28, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm27, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm17, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm16, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm25, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm25 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm25 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm28 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm0, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,1,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = mem[2,1,3,3] -; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,1,1] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,0,1,3] -; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm27 = mem[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,1,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,0,1,1] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm23 = mem[0,0,1,3] +; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm11 = mem[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,1,3] ; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm15 = mem[2,1,3,3] ; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload @@ -13175,44 +13208,45 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,0,1,3] ; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm13 = mem[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm10 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm15 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm10, %zmm1, %zmm21 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm19, %zmm5 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm23 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm14 +; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm3 = mem[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm12 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm12, %zmm0, %zmm27 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm19, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm24 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm14 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 640(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512DQ-FAST-NEXT: addq $2264, %rsp # imm = 0x8D8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -13221,223 +13255,222 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $136, %rsp ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm31 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm25 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm30 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm19 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm12 -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm0, %zmm12 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm7, %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm20 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm21, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm25, %zmm6, %zmm3 ; AVX512BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,59,u,u,4,5,6,7,60,u,u,11,12,13,14,61,u,u,18,19,20,21,62,u,u,25,26,27,28,63,u,u> -; AVX512BW-NEXT: vpermi2w %zmm31, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm0, %zmm14 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm13, %zmm15 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm11, %zmm3 ; AVX512BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermi2w %zmm31, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34] ; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512BW-NEXT: vpermt2w %zmm10, %zmm28, %zmm16 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34] -; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm21, %zmm30, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm7 ; AVX512BW-NEXT: movl $-1048377844, %ecx # imm = 0xC183060C ; AVX512BW-NEXT: kmovd %ecx, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm15 {%k3} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm23 -; AVX512BW-NEXT: vpermt2w %zmm26, %zmm24, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm28, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm28 -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm30, %zmm26 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm17 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm31 +; AVX512BW-NEXT: vpermt2w %zmm29, %zmm23, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 ; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 -; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm26 {%k3} +; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} ; AVX512BW-NEXT: kmovd %ecx, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm15 {%k3} +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm30 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm14, %zmm31 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm4, %zmm31 -; AVX512BW-NEXT: vmovdqu16 %zmm31, %zmm26 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermi2w %zmm21, %zmm2, %zmm4 -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm10, %zmm7 -; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vpermi2w %zmm9, %zmm4, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2w %zmm26, %zmm25, %zmm30 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm25 +; AVX512BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm30 +; AVX512BW-NEXT: vmovdqu16 %zmm30, %zmm29 {%k3} +; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm21 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm12, %zmm6 +; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm6 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vpermi2w %zmm20, %zmm2, %zmm21 ; AVX512BW-NEXT: movl $-507279602, %eax # imm = 0xE1C3870E ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm7 {%k3} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm4, %zmm12 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm14, %zmm16 +; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm6 {%k3} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm2, %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm21, %zmm16 ; AVX512BW-NEXT: movl $202911840, %eax # imm = 0xC183060 ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm16 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermi2w %zmm10, %zmm5, %zmm12 -; AVX512BW-NEXT: vpermi2w %zmm21, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm13 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm3, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm9, %zmm12, %zmm31 +; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm16 {%k3} +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm24 +; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm11 +; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm20, %zmm3, %zmm24 ; AVX512BW-NEXT: movl $473460961, %eax # imm = 0x1C3870E1 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm31, %zmm13 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm12, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm31, %zmm18 +; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm26, %zmm3, %zmm22 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512BW-NEXT: vpermt2w %zmm25, %zmm24, %zmm22 ; AVX512BW-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm16 {%k2} -; AVX512BW-NEXT: vpermi2w %zmm21, %zmm2, %zmm4 -; AVX512BW-NEXT: vpermi2w %zmm10, %zmm5, %zmm14 -; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm14 {%k3} -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm3, %zmm12 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm31, %zmm12 -; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm14 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm4, %zmm29 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm12, %zmm17 -; AVX512BW-NEXT: vmovdqu16 %zmm29, %zmm17 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm10, %zmm5, %zmm4 -; AVX512BW-NEXT: vpermi2w %zmm21, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u> -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm4, %zmm6 +; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2} +; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm2 +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm21 +; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm21 {%k3} +; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm20, %zmm24, %zmm3 +; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm21 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm2, %zmm27 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm3, %zmm17 +; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm17 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm2 +; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u> +; AVX512BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm10 ; AVX512BW-NEXT: movl $946921923, %eax # imm = 0x3870E1C3 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm17 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm17 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm0, %zmm22 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm0, %zmm18 ; AVX512BW-NEXT: movl $405823681, %eax # imm = 0x183060C1 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm22 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm4, %zmm1 -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm21, %zmm24 -; AVX512BW-NEXT: vpermi2w %zmm10, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu16 %zmm31, %zmm18 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm26, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermi2w %zmm14, %zmm15, %zmm23 +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermt2w %zmm25, %zmm10, %zmm1 ; AVX512BW-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm22 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm18 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm20, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm1, %zmm27 -; AVX512BW-NEXT: vpermt2w %zmm10, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm1, %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm1, %zmm28 -; AVX512BW-NEXT: vpermt2w %zmm21, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm28 {%k3} +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm1, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm19 {%k3} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm1, %zmm30 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm26, %zmm1, %zmm28 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u> -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm1, %zmm30 -; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm2 {%k3} +; AVX512BW-NEXT: vpermt2w %zmm20, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm25, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm14 {%k3} ; AVX512BW-NEXT: movl $1893843847, %eax # imm = 0x70E1C387 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm30, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,59,u,4,5,6,7,8,60,u,11,12,13,14,15,61,u,18,19,20,21,22,62,u,25,26,27,28,29,63,u> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2w %zmm19, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermi2w %zmm26, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermi2w %zmm19, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermi2w %zmm26, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] -; AVX512BW-NEXT: vpermi2w %zmm20, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermi2w %zmm25, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] -; AVX512BW-NEXT: vpermi2w %zmm20, %zmm3, %zmm1 +; AVX512BW-NEXT: vpermi2w %zmm25, %zmm2, %zmm1 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 576(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm17, 640(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 768(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 832(%rax) ; AVX512BW-NEXT: addq $136, %rsp diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll index 36c79fa995dcc..e5a6eea44b8ad 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -538,7 +538,7 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa (%rcx), %xmm11 ; SSE-NEXT: movdqa (%r8), %xmm4 ; SSE-NEXT: movdqa (%r9), %xmm8 -; SSE-NEXT: movdqa (%r10), %xmm2 +; SSE-NEXT: movdqa (%r10), %xmm3 ; SSE-NEXT: movdqa (%rax), %xmm10 ; SSE-NEXT: movdqa %xmm1, %xmm13 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] @@ -546,14 +546,14 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] -; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,0,0,0] ; SSE-NEXT: movdqa %xmm4, %xmm15 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] ; SSE-NEXT: movdqa %xmm15, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1] @@ -574,14 +574,14 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: movdqa %xmm0, %xmm9 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,0,0] ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1] ; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] @@ -590,10 +590,10 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm4[2,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -604,7 +604,7 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movapd %xmm7, 32(%rax) ; SSE-NEXT: movaps %xmm6, 48(%rax) ; SSE-NEXT: movaps %xmm5, 16(%rax) -; SSE-NEXT: movapd %xmm3, (%rax) +; SSE-NEXT: movapd %xmm2, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride8_vf8: @@ -849,20 +849,19 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride8_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $72, %rsp +; SSE-NEXT: subq $88, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm9 -; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa 16(%rdi), %xmm15 ; SSE-NEXT: movdqa (%rsi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm7 ; SSE-NEXT: movdqa (%rcx), %xmm1 ; SSE-NEXT: movdqa (%r8), %xmm8 ; SSE-NEXT: movdqa (%r9), %xmm2 -; SSE-NEXT: movdqa (%r10), %xmm10 +; SSE-NEXT: movdqa (%r10), %xmm11 ; SSE-NEXT: movdqa (%rax), %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm5 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; SSE-NEXT: movdqa %xmm8, %xmm12 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] @@ -873,9 +872,9 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm7, %xmm6 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm9 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: movdqa %xmm10, %xmm13 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] @@ -885,142 +884,145 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] +; SSE-NEXT: movdqa 16(%rdx), %xmm9 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] ; SSE-NEXT: movdqa 16(%rcx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] ; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] -; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movdqa 16(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 16(%r10), %xmm6 -; SSE-NEXT: movdqa 16(%rax), %xmm10 -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; SSE-NEXT: movdqa 16(%r10), %xmm15 +; SSE-NEXT: movdqa 16(%rax), %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] ; SSE-NEXT: movdqa 16(%r8), %xmm4 ; SSE-NEXT: movdqa 16(%r9), %xmm11 ; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm13[0],xmm7[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm13[0],xmm5[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm9[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm15[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm3[0],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm2[2,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm3[0],xmm10[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm15[2],xmm4[3],xmm15[3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm1, 224(%rax) ; SSE-NEXT: movaps %xmm3, 240(%rax) -; SSE-NEXT: movapd %xmm8, 160(%rax) -; SSE-NEXT: movaps %xmm9, 176(%rax) +; SSE-NEXT: movapd %xmm10, 160(%rax) +; SSE-NEXT: movaps %xmm8, 176(%rax) ; SSE-NEXT: movapd %xmm13, 96(%rax) ; SSE-NEXT: movaps %xmm12, 112(%rax) -; SSE-NEXT: movapd %xmm7, 32(%rax) -; SSE-NEXT: movaps %xmm10, 48(%rax) -; SSE-NEXT: movapd %xmm14, 192(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 208(%rax) +; SSE-NEXT: movapd %xmm5, 32(%rax) +; SSE-NEXT: movaps %xmm6, 48(%rax) +; SSE-NEXT: movapd %xmm9, 192(%rax) +; SSE-NEXT: movaps %xmm11, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) @@ -1028,7 +1030,7 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: addq $72, %rsp +; SSE-NEXT: addq $88, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride8_vf16: @@ -1036,138 +1038,138 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: subq $136, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 16(%r10), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm12[0],zero,xmm12[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm13, %ymm10 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm9 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm14, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0],ymm13[1],ymm9[2,3,4],ymm13[5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2],ymm3[3],ymm12[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4],ymm10[5],ymm8[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm14, %ymm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[2,2,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm11[3],ymm8[4,5,6],ymm11[7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm11[3],ymm8[4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3],ymm4[4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4],ymm7[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm14[0],zero,xmm14[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm15, %ymm2 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] @@ -1208,37 +1210,35 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-SLOW-LABEL: store_i16_stride8_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: pushq %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm7 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm11 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm14 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm13 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm15 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm9[0],zero,xmm9[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm5 +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm9[0],zero,xmm9[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] @@ -1251,26 +1251,26 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm9 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3],ymm11[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm12 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3],ymm11[4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm14 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm3[2,3],ymm13[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero @@ -1282,64 +1282,63 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[8],ymm12[8],ymm9[9],ymm12[9],ymm9[10],ymm12[10],ymm9[11],ymm12[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[8],ymm14[8],ymm9[9],ymm14[9],ymm9[10],ymm14[10],ymm9[11],ymm14[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[12],ymm12[12],ymm9[13],ymm12[13],ymm9[14],ymm12[14],ymm9[15],ymm12[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm11[2,3],ymm2[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm14[4],ymm9[5],ymm14[5],ymm9[6],ymm14[6],ymm9[7],ymm14[7],ymm9[12],ymm14[12],ymm9[13],ymm14[13],ymm9[14],ymm14[14],ymm9[15],ymm14[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 128(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 224(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 224(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm11, 160(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm7, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1348,7 +1347,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) -; AVX2-SLOW-NEXT: popq %rax ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1357,123 +1355,123 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: pushq %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm3 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm5 ; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,1,1> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm6 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm14 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm15 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3,4],ymm13[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm9 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2,3,4],ymm14[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3],ymm2[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm14 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm10 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm11[1],ymm7[2,3,4],ymm11[5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm9 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2,3,4],ymm2[5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm12 +; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm13 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,0,u,u,1,1> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm14 +; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm15 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[8],ymm14[8],ymm12[9],ymm14[9],ymm12[10],ymm14[10],ymm12[11],ymm14[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[8],ymm15[8],ymm13[9],ymm15[9],ymm13[10],ymm15[10],ymm13[11],ymm15[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[8],ymm12[8],ymm9[9],ymm12[9],ymm9[10],ymm12[10],ymm9[11],ymm12[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm13, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm14[4],ymm12[5],ymm14[5],ymm12[6],ymm14[6],ymm12[7],ymm14[7],ymm12[12],ymm14[12],ymm12[13],ymm14[13],ymm12[14],ymm14[14],ymm12[15],ymm14[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm14[0],ymm6[0],ymm14[1],ymm6[1],ymm14[2],ymm6[2],ymm14[3],ymm6[3],ymm14[8],ymm6[8],ymm14[9],ymm6[9],ymm14[10],ymm6[10],ymm14[11],ymm6[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm15[4],ymm13[5],ymm15[5],ymm13[6],ymm15[6],ymm13[7],ymm15[7],ymm13[12],ymm15[12],ymm13[13],ymm15[13],ymm13[14],ymm15[14],ymm13[15],ymm15[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[12],ymm12[12],ymm9[13],ymm12[13],ymm9[14],ymm12[14],ymm9[15],ymm12[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3],ymm12[4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm11[2,3],ymm4[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm14[4],ymm6[4],ymm14[5],ymm6[5],ymm14[6],ymm6[6],ymm14[7],ymm6[7],ymm14[12],ymm6[12],ymm14[13],ymm6[13],ymm14[14],ymm6[14],ymm14[15],ymm6[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3],ymm10[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,1,3,5,7,5,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm1, 128(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 224(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 224(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 160(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1488,37 +1486,35 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride8_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: pushq %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm9[0],zero,xmm9[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm9[0],zero,xmm9[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] @@ -1531,26 +1527,26 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3],ymm11[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3],ymm11[4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm3[2,3],ymm13[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero @@ -1562,64 +1558,63 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[8],ymm12[8],ymm9[9],ymm12[9],ymm9[10],ymm12[10],ymm9[11],ymm12[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[8],ymm14[8],ymm9[9],ymm14[9],ymm9[10],ymm14[10],ymm9[11],ymm14[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[12],ymm12[12],ymm9[13],ymm12[13],ymm9[14],ymm12[14],ymm9[15],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm11[2,3],ymm2[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm14[4],ymm9[5],ymm14[5],ymm9[6],ymm14[6],ymm9[7],ymm14[7],ymm9[12],ymm14[12],ymm9[13],ymm14[13],ymm9[14],ymm14[14],ymm9[15],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1628,7 +1623,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: popq %rax ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1636,56 +1630,55 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512F-NEXT: vmovdqa (%rdx), %ymm9 ; AVX512F-NEXT: vmovdqa (%rcx), %ymm10 ; AVX512F-NEXT: vmovdqa (%r8), %ymm15 ; AVX512F-NEXT: vmovdqa (%r9), %ymm3 -; AVX512F-NEXT: vmovdqa (%r10), %ymm6 -; AVX512F-NEXT: vmovdqa (%rax), %ymm0 -; AVX512F-NEXT: vmovdqa (%rax), %xmm2 -; AVX512F-NEXT: vmovdqa (%r10), %xmm4 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512F-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512F-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm16 -; AVX512F-NEXT: vmovdqa (%r9), %xmm2 +; AVX512F-NEXT: vmovdqa (%r10), %ymm4 +; AVX512F-NEXT: vmovdqa (%rax), %ymm1 +; AVX512F-NEXT: vmovdqa (%rax), %xmm5 +; AVX512F-NEXT: vmovdqa (%r10), %xmm6 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-NEXT: vmovdqa64 %xmm6, %xmm21 +; AVX512F-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16 +; AVX512F-NEXT: vmovdqa (%r9), %xmm5 ; AVX512F-NEXT: vmovdqa (%r8), %xmm8 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; AVX512F-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm20 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX512F-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 ; AVX512F-NEXT: vmovdqa (%rcx), %xmm11 ; AVX512F-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm17 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm17 ; AVX512F-NEXT: vmovdqa (%rsi), %xmm13 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm14 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11] -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm18 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm15[0],ymm3[0],ymm15[1],ymm3[1],ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11] -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm19 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512F-NEXT: vmovdqa %ymm7, %ymm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm18 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm15[0],ymm3[0],ymm15[1],ymm3[1],ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11] +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] +; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm19 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11] ; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[12],ymm0[12],ymm6[13],ymm0[13],ymm6[14],ymm0[14],ymm6[15],ymm0[15] -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm15[4],ymm3[4],ymm15[5],ymm3[5],ymm15[6],ymm3[6],ymm15[7],ymm3[7],ymm15[12],ymm3[12],ymm15[13],ymm3[13],ymm15[14],ymm3[14],ymm15[15],ymm3[15] ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] +; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15] ; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15] -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 -; AVX512F-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512F-NEXT: vmovdqa64 %xmm21, %xmm2 ; AVX512F-NEXT: vmovdqa64 %xmm22, %xmm9 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] ; AVX512F-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 -; AVX512F-NEXT: vmovdqa64 %xmm23, %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX512F-NEXT: vmovdqa64 %xmm23, %xmm2 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] ; AVX512F-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX512F-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 @@ -1695,25 +1688,25 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,u,0,16,u,u,1,17,10,10,10,26,u,u,11,27> ; AVX512F-NEXT: vpermt2d %zmm16, %zmm12, %zmm20 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,16,1,u,1,17,u,u,10,26,11,11,11,27,u,u> -; AVX512F-NEXT: vpermt2d %zmm17, %zmm13, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm13, %zmm0 ; AVX512F-NEXT: movb $-86, %cl ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] -; AVX512F-NEXT: vpermt2d %zmm18, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] +; AVX512F-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] ; AVX512F-NEXT: vpermt2d %zmm19, %zmm14, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm14, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm14, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} ; AVX512F-NEXT: vpermt2d %zmm9, %zmm12, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm10, %zmm13, %zmm11 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1784,150 +1777,151 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: subq $264, %rsp # imm = 0x108 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa (%rdx), %xmm3 +; SSE-NEXT: movdqa (%rdx), %xmm4 ; SSE-NEXT: movdqa (%rcx), %xmm10 -; SSE-NEXT: movdqa (%r8), %xmm5 +; SSE-NEXT: movdqa (%r8), %xmm6 ; SSE-NEXT: movdqa (%r9), %xmm9 ; SSE-NEXT: movdqa (%r10), %xmm7 ; SSE-NEXT: movdqa (%rax), %xmm11 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm4[0],xmm14[1] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm4[2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm13[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%r8), %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; SSE-NEXT: movdqa 16(%r9), %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; SSE-NEXT: movdqa 16(%r9), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; SSE-NEXT: movdqa 16(%r10), %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm13[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,3] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: movdqa 16(%rax), %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm6, %xmm8 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm8[2,3] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm3[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; SSE-NEXT: movdqa 16(%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: movdqa 16(%rdx), %xmm3 ; SSE-NEXT: movdqa 16(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa 16(%rsi), %xmm10 -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rsi), %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[2,3] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm6[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm6[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 32(%r10), %xmm0 ; SSE-NEXT: movdqa 32(%rax), %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm6 @@ -1937,21 +1931,22 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: movdqa 32(%rdx), %xmm2 ; SSE-NEXT: movdqa 32(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 32(%rsi), %xmm10 +; SSE-NEXT: movdqa 32(%rsi), %xmm9 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa %xmm7, %xmm12 @@ -1962,16 +1957,16 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -1998,54 +1993,54 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,3] -; SSE-NEXT: movdqa 48(%r10), %xmm7 +; SSE-NEXT: movdqa 48(%r10), %xmm9 ; SSE-NEXT: movdqa 48(%rax), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa 48(%r8), %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa 48(%r8), %xmm4 ; SSE-NEXT: movdqa 48(%r9), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: movdqa 48(%rdx), %xmm6 ; SSE-NEXT: movdqa 48(%rcx), %xmm13 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa 48(%rsi), %xmm12 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm7[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm8[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm7[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] @@ -2053,26 +2048,26 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm0, 496(%rax) -; SSE-NEXT: movapd %xmm4, 480(%rax) +; SSE-NEXT: movapd %xmm5, 480(%rax) ; SSE-NEXT: movaps %xmm3, 464(%rax) ; SSE-NEXT: movapd %xmm1, 448(%rax) -; SSE-NEXT: movaps %xmm8, 432(%rax) -; SSE-NEXT: movapd %xmm9, 416(%rax) +; SSE-NEXT: movaps %xmm7, 432(%rax) +; SSE-NEXT: movapd %xmm8, 416(%rax) ; SSE-NEXT: movaps %xmm10, 400(%rax) ; SSE-NEXT: movapd %xmm11, 384(%rax) ; SSE-NEXT: movaps %xmm14, 368(%rax) @@ -2183,21 +2178,21 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm14 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] @@ -2221,17 +2216,17 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2240,36 +2235,36 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm1 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] @@ -2289,105 +2284,105 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6],ymm5[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm11 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm13[0],zero,xmm13[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm13[0],zero,xmm13[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[2,2,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3],ymm2[4,5,6],ymm8[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3],ymm1[4,5,6],ymm8[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] @@ -2429,8 +2424,8 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm14, 288(%rax) @@ -2508,27 +2503,27 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,2,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm4[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm0[0],zero,xmm0[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm3[1],ymm12[2,3,4],ymm3[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] @@ -2547,8 +2542,8 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero @@ -2613,123 +2608,123 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm4[4],ymm13[5],ymm4[5],ymm13[6],ymm4[6],ymm13[7],ymm4[7],ymm13[12],ymm4[12],ymm13[13],ymm4[13],ymm13[14],ymm4[14],ymm13[15],ymm4[15] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[8],ymm4[8],ymm13[9],ymm4[9],ymm13[10],ymm4[10],ymm13[11],ymm4[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm12 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[12],ymm9[12],ymm12[13],ymm9[13],ymm12[14],ymm9[14],ymm12[15],ymm9[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm4 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm13 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm6[4],ymm13[4],ymm6[5],ymm13[5],ymm6[6],ymm13[6],ymm6[7],ymm13[7],ymm6[12],ymm13[12],ymm6[13],ymm13[13],ymm6[14],ymm13[14],ymm6[15],ymm13[15] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[8],ymm9[8],ymm12[9],ymm9[9],ymm12[10],ymm9[10],ymm12[11],ymm9[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 224(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 224(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm15, 192(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm11, 416(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2763,73 +2758,74 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: subq $296, %rsp # imm = 0x128 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm4 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,0,u,u,1,1> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,0,u,u,u,1,u> ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm15, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3,4],ymm4[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm9 +; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm10 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,2,3,3,3,3,u,u> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 @@ -2837,40 +2833,40 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,0,0,u,u,1,1> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,u,1,u,1,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm1 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm9 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] @@ -2878,137 +2874,138 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r10), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r10), %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4],ymm1[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3],ymm1[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm12 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm4, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm2[3],ymm10[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,5,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3],ymm0[4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm15, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm13, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3,4],ymm1[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm13, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm11 -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm13 +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm11 +; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm9 ; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm5 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm9[4],ymm5[4],ymm9[5],ymm5[5],ymm9[6],ymm5[6],ymm9[7],ymm5[7],ymm9[12],ymm5[12],ymm9[13],ymm5[13],ymm9[14],ymm5[14],ymm9[15],ymm5[15] ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm11[4],ymm8[4],ymm11[5],ymm8[5],ymm11[6],ymm8[6],ymm11[7],ymm8[7],ymm11[12],ymm8[12],ymm11[13],ymm8[13],ymm11[14],ymm8[14],ymm11[15],ymm8[15] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm3, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[12],ymm12[12],ymm10[13],ymm12[13],ymm10[14],ymm12[14],ymm10[15],ymm12[15] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm14 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm7[4],ymm10[5],ymm7[5],ymm10[6],ymm7[6],ymm10[7],ymm7[7],ymm10[12],ymm7[12],ymm10[13],ymm7[13],ymm10[14],ymm7[14],ymm10[15],ymm7[15] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm13[2,3],ymm2[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm13, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3],ymm2[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm12, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm14, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,6,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm8[0],ymm11[1],ymm8[1],ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[8],ymm8[8],ymm11[9],ymm8[9],ymm11[10],ymm8[10],ymm11[11],ymm8[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[8],ymm12[8],ymm10[9],ymm12[9],ymm10[10],ymm12[10],ymm10[11],ymm12[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[8],ymm5[8],ymm9[9],ymm5[9],ymm9[10],ymm5[10],ymm9[11],ymm5[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,4,2,1,6,5,6,5] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm14, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm15, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 128(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm2, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3091,27 +3088,27 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm4[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm3[1],ymm12[2,3,4],ymm3[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] @@ -3130,8 +3127,8 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero @@ -3196,123 +3193,123 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm4[4],ymm13[5],ymm4[5],ymm13[6],ymm4[6],ymm13[7],ymm4[7],ymm13[12],ymm4[12],ymm13[13],ymm4[13],ymm13[14],ymm4[14],ymm13[15],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[8],ymm4[8],ymm13[9],ymm4[9],ymm13[10],ymm4[10],ymm13[11],ymm4[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[12],ymm9[12],ymm12[13],ymm9[13],ymm12[14],ymm9[14],ymm12[15],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm6[4],ymm13[4],ymm6[5],ymm13[5],ymm6[6],ymm13[6],ymm6[7],ymm13[7],ymm6[12],ymm13[12],ymm6[13],ymm13[13],ymm6[14],ymm13[14],ymm6[15],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[8],ymm9[8],ymm12[9],ymm9[9],ymm12[10],ymm9[10],ymm12[11],ymm9[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 416(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3345,145 +3342,145 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm4 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm5 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm26, %zmm30 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm26, %zmm30 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> ; AVX512F-SLOW-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm27, %zmm30 {%k1} +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm30 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm10 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> -; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm28, %zmm7 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm28, %zmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> ; AVX512F-SLOW-NEXT: movw $8738, %r11w # imm = 0x2222 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k2 -; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm29, %zmm7 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm18, %zmm31 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm19, %zmm31 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm1[0],ymm14[0],ymm1[1],ymm14[1],ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[8],ymm14[8],ymm1[9],ymm14[9],ymm1[10],ymm14[10],ymm1[11],ymm14[11] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm29, %zmm3 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm19, %zmm31 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm18, %zmm31 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm20, %zmm15 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm20, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm21, %zmm15 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm5 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm12 -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm18, %zmm16 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm19, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm14[4],ymm1[5],ymm14[5],ymm1[6],ymm14[6],ymm1[7],ymm14[7],ymm1[12],ymm14[12],ymm1[13],ymm14[13],ymm1[14],ymm14[14],ymm1[15],ymm14[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm20, %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm21, %zmm17 {%k2} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm26, %zmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm22 {%k1} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm28, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm29, %zmm23 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm26, %zmm24 -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm27, %zmm24 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm28, %zmm25 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm29, %zmm25 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm21, %zmm14 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[12],ymm7[12],ymm12[13],ymm7[13],ymm12[14],ymm7[14],ymm12[15],ymm7[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm7 +; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm19, %zmm17 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm18, %zmm17 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm13[4],ymm6[5],ymm13[5],ymm6[6],ymm13[6],ymm6[7],ymm13[7],ymm6[12],ymm13[12],ymm6[13],ymm13[13],ymm6[14],ymm13[14],ymm6[15],ymm13[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm13 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm20, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm21, %zmm16 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm26, %zmm23 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm27, %zmm23 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm28, %zmm22 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm29, %zmm22 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm26, %zmm25 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm27, %zmm25 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm28, %zmm24 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm24 {%k2} ; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm18, %zmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm19, %zmm5 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm19, %zmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm18, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm7 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm18, %zmm2 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm19, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm19, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm3[0],ymm12[1],ymm3[1],ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[8],ymm3[8],ymm12[9],ymm3[9],ymm12[10],ymm3[10],ymm12[11],ymm3[11] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm18, %zmm2 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11] ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm20, %zmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[2],ymm6[2],ymm1[3],ymm6[3],ymm1[8],ymm6[8],ymm1[9],ymm6[9],ymm1[10],ymm6[10],ymm1[11],ymm6[11] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11] ; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm21, %zmm0 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm6[4],ymm1[5],ymm6[5],ymm1[6],ymm6[6],ymm1[7],ymm6[7],ymm1[12],ymm6[12],ymm1[13],ymm6[13],ymm1[14],ymm6[14],ymm1[15],ymm6[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm12[4],ymm3[4],ymm12[5],ymm3[5],ymm12[6],ymm3[6],ymm12[7],ymm3[7],ymm12[12],ymm3[12],ymm12[13],ymm3[13],ymm12[14],ymm3[14],ymm12[15],ymm3[15] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm20, %zmm3 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm21, %zmm3 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm20, %zmm4 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm21, %zmm4 {%k2} ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm26, %zmm4 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm27, %zmm4 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm26, %zmm5 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm27, %zmm5 {%k1} ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm28, %zmm6 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm6 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm28, %zmm7 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm7 {%k2} ; AVX512F-SLOW-NEXT: movb $-86, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, %zmm15 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, %zmm14 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride8_vf32: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $552, %rsp # imm = 0x228 +; AVX512F-FAST-NEXT: subq $472, %rsp # imm = 0x1D8 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm1 @@ -3495,9 +3492,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm31 ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm1 @@ -3517,42 +3514,41 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %ymm0 ; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm1 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm22 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm25 ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm23 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm22 ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm25 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm23 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] ; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm3 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] ; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm17 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 @@ -3561,124 +3557,123 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm0 ; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm1 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] ; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm27 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm28 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm31 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm29 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm30 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm10 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm30 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm15 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm28 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm29 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm9 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm8 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; AVX512F-FAST-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] -; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm0, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] -; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm1, %zmm24 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm0, %zmm17 -; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm1, %zmm17 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] +; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm31, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] +; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm0, %zmm25 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm31, %zmm17 +; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm0, %zmm17 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm16 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm22 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm24 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm13 ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm14 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm9 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm15 -; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm1, %zmm7 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm31, %zmm6 +; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm0, %zmm6 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm31, %zmm7 ; AVX512F-FAST-NEXT: movw $8738, %ax # imm = 0x2222 ; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm1, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] -; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] -; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm11, %zmm9 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm1, %zmm12 -; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm11, %zmm12 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm11, %zmm3 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm11, %zmm1 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm5 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm4, %zmm11 -; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm6, %zmm11 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm4, %zmm18 -; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm6, %zmm18 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm6, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,1,u,1,u,u,u,10,u,11,u,11,u,u,u> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm2 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm10, %zmm16 -; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm6, %zmm16 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm10, %zmm14 -; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm6, %zmm14 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm10, %zmm10 -; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm6, %zmm10 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm0, %zmm7 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] +; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm0, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] +; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm12, %zmm10 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm0, %zmm22 +; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm12, %zmm22 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm3, %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm12, %zmm3 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm12, %zmm0 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm2, %zmm12 +; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm5, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm2, %zmm15 +; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm5, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm5, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,u,1,u,1,u,u,u,10,u,11,u,11,u,u,u> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm9, %zmm16 +; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm5, %zmm16 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm9, %zmm14 +; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm5, %zmm14 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm5, %zmm9 {%k2} ; AVX512F-FAST-NEXT: movb $-86, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm16 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-FAST-NEXT: addq $552, %rsp # imm = 0x228 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512F-FAST-NEXT: addq $472, %rsp # imm = 0x1D8 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -3825,149 +3820,150 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: subq $776, %rsp # imm = 0x308 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa (%rsi), %xmm6 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa (%rsi), %xmm7 +; SSE-NEXT: movdqa (%rdx), %xmm3 ; SSE-NEXT: movdqa (%rcx), %xmm9 ; SSE-NEXT: movdqa (%r8), %xmm4 ; SSE-NEXT: movdqa (%r9), %xmm10 ; SSE-NEXT: movdqa (%r10), %xmm8 ; SSE-NEXT: movdqa (%rax), %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm5[0],xmm14[1] ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,3] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm13[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r8), %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm6[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm5[2],xmm13[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r8), %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm12[2],xmm6[3],xmm12[3] ; SSE-NEXT: movdqa 16(%r9), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; SSE-NEXT: movdqa 16(%r10), %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm13[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movdqa 16(%r10), %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,3] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: movdqa 16(%rax), %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,0,0] ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm6[0],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm7[0],xmm10[1] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: movdqa 16(%rdx), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: movdqa 16(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 16(%rsi), %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rsi), %xmm9 +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[2,3] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm6[2,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm4[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%r10), %xmm0 ; SSE-NEXT: movdqa 32(%rax), %xmm4 @@ -3978,21 +3974,22 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: movdqa 32(%rdx), %xmm2 ; SSE-NEXT: movdqa 32(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 32(%rsi), %xmm10 +; SSE-NEXT: movdqa 32(%rsi), %xmm9 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa %xmm7, %xmm12 @@ -4003,16 +4000,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -4050,21 +4047,22 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: movdqa 48(%rdx), %xmm2 ; SSE-NEXT: movdqa 48(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movdqa 48(%rsi), %xmm10 +; SSE-NEXT: movdqa 48(%rsi), %xmm9 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa %xmm7, %xmm12 @@ -4075,16 +4073,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -4122,21 +4120,22 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: movdqa 64(%rdx), %xmm2 ; SSE-NEXT: movdqa 64(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; SSE-NEXT: movdqa 64(%rdi), %xmm3 -; SSE-NEXT: movdqa 64(%rsi), %xmm10 +; SSE-NEXT: movdqa 64(%rsi), %xmm9 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa %xmm7, %xmm12 @@ -4147,16 +4146,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -4194,21 +4193,22 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: movdqa 80(%rdx), %xmm2 ; SSE-NEXT: movdqa 80(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa 80(%rsi), %xmm10 +; SSE-NEXT: movdqa 80(%rsi), %xmm9 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa %xmm7, %xmm12 @@ -4219,16 +4219,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -4266,21 +4266,22 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: movdqa 96(%rdx), %xmm2 ; SSE-NEXT: movdqa 96(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; SSE-NEXT: movdqa 96(%rdi), %xmm3 -; SSE-NEXT: movdqa 96(%rsi), %xmm10 +; SSE-NEXT: movdqa 96(%rsi), %xmm9 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa %xmm7, %xmm12 @@ -4291,16 +4292,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -4327,54 +4328,54 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,3] -; SSE-NEXT: movdqa 112(%r10), %xmm7 +; SSE-NEXT: movdqa 112(%r10), %xmm9 ; SSE-NEXT: movdqa 112(%rax), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa 112(%r8), %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa 112(%r8), %xmm4 ; SSE-NEXT: movdqa 112(%r9), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: movdqa 112(%rdx), %xmm6 ; SSE-NEXT: movdqa 112(%rcx), %xmm13 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] ; SSE-NEXT: movdqa 112(%rdi), %xmm2 ; SSE-NEXT: movdqa 112(%rsi), %xmm12 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm7[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm8[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm7[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] @@ -4382,26 +4383,26 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm0, 1008(%rax) -; SSE-NEXT: movapd %xmm4, 992(%rax) +; SSE-NEXT: movapd %xmm5, 992(%rax) ; SSE-NEXT: movaps %xmm3, 976(%rax) ; SSE-NEXT: movapd %xmm1, 960(%rax) -; SSE-NEXT: movaps %xmm8, 944(%rax) -; SSE-NEXT: movapd %xmm9, 928(%rax) +; SSE-NEXT: movaps %xmm7, 944(%rax) +; SSE-NEXT: movapd %xmm8, 928(%rax) ; SSE-NEXT: movaps %xmm10, 912(%rax) ; SSE-NEXT: movapd %xmm11, 896(%rax) ; SSE-NEXT: movaps %xmm14, 880(%rax) @@ -4535,12 +4536,12 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm12 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] @@ -4548,36 +4549,36 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6],ymm13[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm3[1],ymm13[2,3,4],ymm3[5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm8[0],zero,xmm8[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm13 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm5[3],ymm13[4,5,6],ymm5[7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm6[3],ymm13[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm10 @@ -4589,9 +4590,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3],ymm8[4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 @@ -4599,11 +4600,11 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,1,0,1] @@ -4614,7 +4615,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm12 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] @@ -4622,17 +4623,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm5[3],ymm9[4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm5[1],ymm10[2,3,4],ymm5[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -4649,9 +4650,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] @@ -4661,32 +4662,32 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[2,2,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3],ymm4[4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] @@ -4706,17 +4707,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3],ymm2[4,5,6],ymm6[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] @@ -4743,22 +4744,22 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] @@ -4782,13 +4783,13 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] @@ -5204,14 +5205,14 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] @@ -5221,20 +5222,20 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero @@ -5244,23 +5245,23 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,3] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rax), %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rax), %xmm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%r10), %xmm8 @@ -5275,13 +5276,13 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm10[2,3],ymm6[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] @@ -5294,47 +5295,47 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,0,1,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm9[0],zero,xmm9[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm4[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero @@ -5358,16 +5359,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa 96(%r10), %xmm1 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm9[0],zero,xmm9[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm6[0,1,1,3] ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm6 @@ -5391,13 +5392,13 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] @@ -5418,9 +5419,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[8],ymm1[8],ymm12[9],ymm1[9],ymm12[10],ymm1[10],ymm12[11],ymm1[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm4 @@ -5435,8 +5436,8 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[8],ymm0[8],ymm11[9],ymm0[9],ymm11[10],ymm0[10],ymm11[11],ymm0[11] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7] @@ -5454,227 +5455,227 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm1[4],ymm12[5],ymm1[5],ymm12[6],ymm1[6],ymm12[7],ymm1[7],ymm12[12],ymm1[12],ymm12[13],ymm1[13],ymm12[14],ymm1[14],ymm12[15],ymm1[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm0[4],ymm11[5],ymm0[5],ymm11[6],ymm0[6],ymm11[7],ymm0[7],ymm11[12],ymm0[12],ymm11[13],ymm0[13],ymm11[14],ymm0[14],ymm11[15],ymm0[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11] +; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm11 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm13[0],ymm5[0],ymm13[1],ymm5[1],ymm13[2],ymm5[2],ymm13[3],ymm5[3],ymm13[8],ymm5[8],ymm13[9],ymm5[9],ymm13[10],ymm5[10],ymm13[11],ymm5[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[8],ymm4[8],ymm9[9],ymm4[9],ymm9[10],ymm4[10],ymm9[11],ymm4[11] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm8[1],ymm15[2,3,4],ymm8[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm13[4],ymm5[4],ymm13[5],ymm5[5],ymm13[6],ymm5[6],ymm13[7],ymm5[7],ymm13[12],ymm5[12],ymm13[13],ymm5[13],ymm13[14],ymm5[14],ymm13[15],ymm5[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[12],ymm4[12],ymm9[13],ymm4[13],ymm9[14],ymm4[14],ymm9[15],ymm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%r10), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 64(%rax), %ymm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[8],ymm1[8],ymm6[9],ymm1[9],ymm6[10],ymm1[10],ymm6[11],ymm1[11] +; AVX2-SLOW-NEXT: vmovdqa 64(%r10), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 64(%rax), %ymm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %ymm10 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[8],ymm5[8],ymm8[9],ymm5[9],ymm8[10],ymm5[10],ymm8[11],ymm5[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm11 ; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %ymm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3,4],ymm4[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3,4],ymm7[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[12],ymm1[12],ymm6[13],ymm1[13],ymm6[14],ymm1[14],ymm6[15],ymm1[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm5[4],ymm8[5],ymm5[5],ymm8[6],ymm5[6],ymm8[7],ymm5[7],ymm8[12],ymm5[12],ymm8[13],ymm5[13],ymm8[14],ymm5[14],ymm8[15],ymm5[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3,4],ymm1[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r10), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 96(%rax), %ymm6 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %ymm10 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa 96(%r10), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 96(%rax), %ymm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %ymm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm3[3],ymm11[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm11 ; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3,4],ymm15[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4],ymm15[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3,4],ymm3[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 992(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 992(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 960(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 928(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 928(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm15, 896(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 736(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 736(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 704(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5738,14 +5739,14 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: subq $776, %rsp # imm = 0x308 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm14 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm4 ; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm5 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,1,1> ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm6 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm7 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,0,u,u,u,1,u> ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] @@ -5755,103 +5756,102 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,1,1,1,1,u,u> ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm12 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,3],ymm12[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,0,0,u,u,1,1> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm14, %ymm4 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm12 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm5 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm3 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm14, %ymm15 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3],ymm14[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,0,0,0,u,u,1,1> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm13, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm13, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqa 64(%rax), %xmm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] @@ -5869,19 +5869,21 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm9 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm14, %ymm13 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,2,2,2,u,u,3,3> ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm13, %ymm4 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm13, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,3,3,3,3,u,u> ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm5 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5889,23 +5891,23 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,0,0,u,u,1,1> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,1,1,1,1,u,u> ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm13, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5916,7 +5918,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm4 ; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm5 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm14, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm13, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm7 ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm8 @@ -5925,8 +5927,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm11 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm13 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5934,6 +5935,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm2 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,3,3,3,3,u,u> ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm12, %ymm3 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] @@ -5963,109 +5965,110 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm15 -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm9 -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm10 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm8 +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm10 +; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm11 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm15[0],ymm8[0],ymm15[1],ymm8[1],ymm15[2],ymm8[2],ymm15[3],ymm8[3],ymm15[8],ymm8[8],ymm15[9],ymm8[9],ymm15[10],ymm8[10],ymm15[11],ymm8[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm14 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm4[0],ymm14[1],ymm4[1],ymm14[2],ymm4[2],ymm14[3],ymm4[3],ymm14[8],ymm4[8],ymm14[9],ymm4[9],ymm14[10],ymm4[10],ymm14[11],ymm4[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm14 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3,4],ymm5[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3],ymm9[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm15[4],ymm8[4],ymm15[5],ymm8[5],ymm15[6],ymm8[6],ymm15[7],ymm8[7],ymm15[12],ymm8[12],ymm15[13],ymm8[13],ymm15[14],ymm8[14],ymm15[15],ymm8[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm14[4],ymm4[4],ymm14[5],ymm4[5],ymm14[6],ymm4[6],ymm14[7],ymm4[7],ymm14[12],ymm4[12],ymm14[13],ymm4[13],ymm14[14],ymm4[14],ymm14[15],ymm4[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm11, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm2[4],ymm15[5],ymm2[5],ymm15[6],ymm2[6],ymm15[7],ymm2[7],ymm15[12],ymm2[12],ymm15[13],ymm2[13],ymm15[14],ymm2[14],ymm15[15],ymm2[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm10 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 32(%r10), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm15, %ymm9 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1,2],ymm9[3],ymm12[4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm15 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 32(%r10), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm6 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3],ymm11[4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[8],ymm15[8],ymm12[9],ymm15[9],ymm12[10],ymm15[10],ymm12[11],ymm15[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4],ymm1[5],ymm11[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3],ymm1[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4],ymm1[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3],ymm8[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm7, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm2[4],ymm9[5],ymm2[5],ymm9[6],ymm2[6],ymm9[7],ymm2[7],ymm9[12],ymm2[12],ymm9[13],ymm2[13],ymm9[14],ymm2[14],ymm9[15],ymm2[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm12[4],ymm15[4],ymm12[5],ymm15[5],ymm12[6],ymm15[6],ymm12[7],ymm15[7],ymm12[12],ymm15[12],ymm12[13],ymm15[13],ymm12[14],ymm15[14],ymm12[15],ymm15[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm2[4],ymm15[5],ymm2[5],ymm15[6],ymm2[6],ymm15[7],ymm2[7],ymm15[12],ymm2[12],ymm15[13],ymm2[13],ymm15[14],ymm2[14],ymm15[15],ymm2[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [4,4,2,1,6,5,6,5] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm13, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,1,3,5,7,5,7] @@ -6073,61 +6076,59 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm8 ; AVX2-FAST-NEXT: vmovdqa 64(%r10), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 64(%rax), %ymm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm11, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3],ymm10[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm12 ; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm9 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm13, %ymm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm11[0],ymm15[0],ymm11[1],ymm15[1],ymm11[2],ymm15[2],ymm11[3],ymm15[3],ymm11[8],ymm15[8],ymm11[9],ymm15[9],ymm11[10],ymm15[10],ymm11[11],ymm15[11] +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm10 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[8],ymm15[8],ymm12[9],ymm15[9],ymm12[10],ymm15[10],ymm12[11],ymm15[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm13, %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,2,4,6,4,6,6,7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[12],ymm7[12],ymm12[13],ymm7[13],ymm12[14],ymm7[14],ymm12[15],ymm7[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm11[4],ymm15[4],ymm11[5],ymm15[5],ymm11[6],ymm15[6],ymm11[7],ymm15[7],ymm11[12],ymm15[12],ymm11[13],ymm15[13],ymm11[14],ymm15[14],ymm11[15],ymm15[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm12[4],ymm15[4],ymm12[5],ymm15[5],ymm12[6],ymm15[6],ymm12[7],ymm15[7],ymm12[12],ymm15[12],ymm12[13],ymm15[13],ymm12[14],ymm15[14],ymm12[15],ymm15[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,4,2,1,6,5,6,5] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm13, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6144,34 +6145,35 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 96(%r10), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 96(%rax), %ymm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm13, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3],ymm11[4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm12 ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm14 ; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm15 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm13, %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm2[1],ymm13[2,3,4],ymm2[5],ymm13[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm13, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,4,4,4,4,6,5] @@ -6181,17 +6183,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm13, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] @@ -6307,14 +6309,14 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] @@ -6324,20 +6326,20 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero @@ -6347,23 +6349,23 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rax), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rax), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r10), %xmm8 @@ -6378,13 +6380,13 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm10[2,3],ymm6[4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] @@ -6397,47 +6399,47 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm9[0],zero,xmm9[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm4[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero @@ -6461,16 +6463,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r10), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm9[0],zero,xmm9[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm6[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm6 @@ -6494,13 +6496,13 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] @@ -6521,9 +6523,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[8],ymm1[8],ymm12[9],ymm1[9],ymm12[10],ymm1[10],ymm12[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm4 @@ -6538,8 +6540,8 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[8],ymm0[8],ymm11[9],ymm0[9],ymm11[10],ymm0[10],ymm11[11],ymm0[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7] @@ -6557,227 +6559,227 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm1[4],ymm12[5],ymm1[5],ymm12[6],ymm1[6],ymm12[7],ymm1[7],ymm12[12],ymm1[12],ymm12[13],ymm1[13],ymm12[14],ymm1[14],ymm12[15],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm0[4],ymm11[5],ymm0[5],ymm11[6],ymm0[6],ymm11[7],ymm0[7],ymm11[12],ymm0[12],ymm11[13],ymm0[13],ymm11[14],ymm0[14],ymm11[15],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm13[0],ymm5[0],ymm13[1],ymm5[1],ymm13[2],ymm5[2],ymm13[3],ymm5[3],ymm13[8],ymm5[8],ymm13[9],ymm5[9],ymm13[10],ymm5[10],ymm13[11],ymm5[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[8],ymm4[8],ymm9[9],ymm4[9],ymm9[10],ymm4[10],ymm9[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm8[1],ymm15[2,3,4],ymm8[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm13[4],ymm5[4],ymm13[5],ymm5[5],ymm13[6],ymm5[6],ymm13[7],ymm5[7],ymm13[12],ymm5[12],ymm13[13],ymm5[13],ymm13[14],ymm5[14],ymm13[15],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[12],ymm4[12],ymm9[13],ymm4[13],ymm9[14],ymm4[14],ymm9[15],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r10), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rax), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[8],ymm1[8],ymm6[9],ymm1[9],ymm6[10],ymm1[10],ymm6[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r10), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[8],ymm5[8],ymm8[9],ymm5[9],ymm8[10],ymm5[10],ymm8[11],ymm5[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3,4],ymm4[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3,4],ymm7[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[12],ymm1[12],ymm6[13],ymm1[13],ymm6[14],ymm1[14],ymm6[15],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm5[4],ymm8[5],ymm5[5],ymm8[6],ymm5[6],ymm8[7],ymm5[7],ymm8[12],ymm5[12],ymm8[13],ymm5[13],ymm8[14],ymm5[14],ymm8[15],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3,4],ymm1[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r10), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rax), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r10), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rax), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm3[3],ymm11[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3,4],ymm15[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4],ymm15[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3,4],ymm3[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 992(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 992(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 960(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 928(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 928(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 896(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 736(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 736(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 704(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6838,661 +6840,651 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-SLOW-LABEL: store_i16_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512F-SLOW-NEXT: subq $504, %rsp # imm = 0x1F8 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %xmm3 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm16, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm7 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm30, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> ; AVX512F-SLOW-NEXT: movw $-30584, %r11w # imm = 0x8888 -; AVX512F-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm27, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: kmovw %r11d, %k2 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm0 {%k2} ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: movw $8738, %r11w # imm = 0x2222 -; AVX512F-SLOW-NEXT: kmovw %r11d, %k2 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %ymm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm29, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm13, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: kmovw %r11d, %k1 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %ymm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11] +; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm10, %zmm19, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm18, %zmm0 {%k2} ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm10 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[8],ymm6[8],ymm0[9],ymm6[9],ymm0[10],ymm6[10],ymm0[11],ymm6[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm14, %zmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm5[0],ymm11[1],ymm5[1],ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[8],ymm5[8],ymm11[9],ymm5[9],ymm11[10],ymm5[10],ymm11[11],ymm5[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm15, %zmm4 {%k2} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm2 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm13, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm5[4],ymm11[5],ymm5[5],ymm11[6],ymm5[6],ymm11[7],ymm5[7],ymm11[12],ymm5[12],ymm11[13],ymm5[13],ymm11[14],ymm5[14],ymm11[15],ymm5[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[12],ymm6[12],ymm0[13],ymm6[13],ymm0[14],ymm6[14],ymm0[15],ymm6[15] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm14, %zmm31 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm15, %zmm31 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpermd %zmm11, %zmm16, %zmm4 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm27, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm16, %zmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %ymm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm12[0],ymm6[0],ymm12[1],ymm6[1],ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[8],ymm6[8],ymm12[9],ymm6[9],ymm12[10],ymm6[10],ymm12[11],ymm6[11] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm29, %zmm4 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm13, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm12 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm16, %zmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm17, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm19, %zmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm18, %zmm0 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm16, %zmm31 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm17, %zmm31 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm30, %zmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm29, %zmm0 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm30, %zmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm0 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %ymm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11] +; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] +; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm19, %zmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm18, %zmm0 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm12 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm14, %zmm28 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm15, %zmm28 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %xmm4 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm12[4],ymm6[4],ymm12[5],ymm6[5],ymm12[6],ymm6[6],ymm12[7],ymm6[7],ymm12[12],ymm6[12],ymm12[13],ymm6[13],ymm12[14],ymm6[14],ymm12[15],ymm6[15] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm29, %zmm6 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm13, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm14, %zmm23 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm15, %zmm23 {%k2} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm16, %zmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm16, %zmm26 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm17, %zmm26 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %xmm8 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm2[4],ymm9[5],ymm2[5],ymm9[6],ymm2[6],ymm9[7],ymm2[7],ymm9[12],ymm2[12],ymm9[13],ymm2[13],ymm9[14],ymm2[14],ymm9[15],ymm2[15] +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %xmm9 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] +; AVX512F-SLOW-NEXT: vpermd %zmm10, %zmm19, %zmm5 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm18, %zmm5 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm12[4],ymm15[5],ymm12[5],ymm15[6],ymm12[6],ymm15[7],ymm12[7],ymm15[12],ymm12[12],ymm15[13],ymm12[13],ymm15[14],ymm12[14],ymm15[15],ymm12[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm16, %zmm24 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm17, %zmm24 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm30, %zmm1 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm29, %zmm1 {%k2} ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[8],ymm6[8],ymm10[9],ymm6[9],ymm10[10],ymm6[10],ymm10[11],ymm6[11] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm29, %zmm30 -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm13, %zmm30 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm19, %zmm28 +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm18, %zmm28 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm11[0],ymm5[0],ymm11[1],ymm5[1],ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[8],ymm5[8],ymm11[9],ymm5[9],ymm11[10],ymm5[10],ymm11[11],ymm5[11] -; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm14, %zmm22 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm16, %zmm23 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm12[0],ymm0[1],ymm12[1],ymm0[2],ymm12[2],ymm0[3],ymm12[3],ymm0[8],ymm12[8],ymm0[9],ymm12[9],ymm0[10],ymm12[10],ymm0[11],ymm12[11] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm15, %zmm22 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[12],ymm6[12],ymm10[13],ymm6[13],ymm10[14],ymm6[14],ymm10[15],ymm6[15] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm29, %zmm24 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm13, %zmm24 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm12[4],ymm0[5],ymm12[5],ymm0[6],ymm12[6],ymm0[7],ymm12[7],ymm0[12],ymm12[12],ymm0[13],ymm12[13],ymm0[14],ymm12[14],ymm0[15],ymm12[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm5[4],ymm11[5],ymm5[5],ymm11[6],ymm5[6],ymm11[7],ymm5[7],ymm11[12],ymm5[12],ymm11[13],ymm5[13],ymm11[14],ymm5[14],ymm11[15],ymm5[15] -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm14, %zmm25 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm15, %zmm25 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm29, %zmm26 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm13, %zmm26 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm29 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm12[0],ymm3[1],ymm12[1],ymm3[2],ymm12[2],ymm3[3],ymm12[3],ymm3[8],ymm12[8],ymm3[9],ymm12[9],ymm3[10],ymm12[10],ymm3[11],ymm12[11] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm17, %zmm23 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm19, %zmm25 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm18, %zmm25 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm12[4],ymm3[5],ymm12[5],ymm3[6],ymm12[6],ymm3[7],ymm12[7],ymm3[12],ymm12[12],ymm3[13],ymm12[13],ymm3[14],ymm12[14],ymm3[15],ymm12[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm16, %zmm21 +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm17, %zmm21 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm19, %zmm27 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[8],ymm3[8],ymm8[9],ymm3[9],ymm8[10],ymm3[10],ymm8[11],ymm3[11] +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm18, %zmm27 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[12],ymm3[12],ymm8[13],ymm3[13],ymm8[14],ymm3[14],ymm8[15],ymm3[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm19, %zmm20 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm13, %zmm29 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm14, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11] -; AVX512F-SLOW-NEXT: vpermd %zmm10, %zmm15, %zmm8 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm18, %zmm20 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm16, %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm17, %zmm18 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] ; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm4 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm14, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm6 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm15, %zmm7 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm16, %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm27, %zmm17 {%k1} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm16, %zmm16 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm27, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm0, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm27, %zmm10 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm13 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm16, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm3 +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm17, %zmm16 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 +; AVX512F-SLOW-NEXT: vpermd %zmm10, %zmm30, %zmm19 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm19 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm30, %zmm10 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm29, %zmm10 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm30, %zmm17 +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm29, %zmm17 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm22 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm15 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm1, %zmm15 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm18 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm27, %zmm15 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm13, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm27, %zmm6 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm6 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm30, %zmm8 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm29, %zmm8 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm29, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm30, %zmm7 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm13, %zmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm29, %zmm6 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpermd %zmm11, %zmm27, %zmm9 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm30, %zmm6 {%k1} ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm13, %zmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm1 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm5 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm30, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm13, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm29, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm14 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm27, %zmm4 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm13, %zmm2 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm2 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpermd %zmm11, %zmm13, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm27, %zmm11 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm30, %zmm4 {%k1} ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm13, %zmm3 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm3 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm13, %zmm5 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm5 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm3 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm30, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm29, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX512F-SLOW-NEXT: vpermd %zmm11, %zmm30, %zmm14 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm1 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm30, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm2 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm30, %zmm2 {%k1} ; AVX512F-SLOW-NEXT: movb $-86, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm11 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm31 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm26 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm24 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm21 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 576(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 576(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 512(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 704(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, 640(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 832(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 768(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 704(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 640(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 832(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 896(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-SLOW-NEXT: addq $520, %rsp # imm = 0x208 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 896(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-SLOW-NEXT: addq $504, %rsp # imm = 0x1F8 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride8_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $2504, %rsp # imm = 0x9C8 +; AVX512F-FAST-NEXT: subq $2312, %rsp # imm = 0x908 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm11 -; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %xmm7 -; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm12 +; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%r10), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 96(%rax), %ymm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %ymm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm6 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%r10), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 96(%rax), %ymm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %ymm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm7 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm9 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%r10), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 96(%rax), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%r10), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 96(%rax), %xmm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm7 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %ymm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %ymm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %ymm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] ; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm8 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[2],ymm8[2],ymm5[3],ymm8[3],ymm5[8],ymm8[8],ymm5[9],ymm8[9],ymm5[10],ymm8[10],ymm5[11],ymm8[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm8[4],ymm5[5],ymm8[5],ymm5[6],ymm8[6],ymm5[7],ymm8[7],ymm5[12],ymm8[12],ymm5[13],ymm8[13],ymm5[14],ymm8[14],ymm5[15],ymm8[15] ; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %xmm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm31 -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %xmm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm30 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm31 +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm1 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm30 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm27 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm21 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm19 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm29 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm28 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm15 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm24 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm23 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm26 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm23 ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm22 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm22 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[8],ymm6[8],ymm3[9],ymm6[9],ymm3[10],ymm6[10],ymm3[11],ymm6[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm7 ; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm15, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm10 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm5[0],ymm10[1],ymm5[1],ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[8],ymm5[8],ymm10[9],ymm5[9],ymm10[10],ymm5[10],ymm10[11],ymm5[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm16 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm13, %zmm15 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm12 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm10 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[12],ymm6[12],ymm3[13],ymm6[13],ymm3[14],ymm6[14],ymm3[15],ymm6[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm6 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm5[4],ymm10[5],ymm5[5],ymm10[6],ymm5[6],ymm10[7],ymm5[7],ymm10[12],ymm5[12],ymm10[13],ymm5[13],ymm10[14],ymm5[14],ymm10[15],ymm5[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm2 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm11[4],ymm14[5],ymm11[5],ymm14[6],ymm11[6],ymm14[7],ymm11[7],ymm14[12],ymm11[12],ymm14[13],ymm11[13],ymm14[14],ymm11[14],ymm14[15],ymm11[15] ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm11 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX512F-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm14 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm12 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm1, %zmm31 -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm1, %zmm28 -; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm0, %zmm28 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm25 -; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm0, %zmm25 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm1, %zmm16 -; AVX512F-FAST-NEXT: vpermd %zmm18, %zmm0, %zmm16 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm1, %zmm3 -; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm0, %zmm3 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm0, %zmm24 +; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm24 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm18, %zmm0, %zmm25 +; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm1, %zmm25 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm0, %zmm5 +; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm1, %zmm5 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm0, %zmm27 +; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm1, %zmm27 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm19, %zmm0, %zmm5 +; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm1, %zmm5 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm0, %zmm19 +; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm1, %zmm19 {%k2} ; AVX512F-FAST-NEXT: movw $8738, %ax # imm = 0x2222 ; AVX512F-FAST-NEXT: kmovw %eax, %k1 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm18 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm26 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm30 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm0, %zmm27 -; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm12, %zmm27 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm19, %zmm0, %zmm19 -; AVX512F-FAST-NEXT: vpermd %zmm21, %zmm12, %zmm19 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm0, %zmm15 -; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm12, %zmm15 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm12, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm7 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm17 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm17 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm21 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm21 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm29 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm29 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm5, %zmm23 -; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm12, %zmm23 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm20, %zmm5, %zmm20 -; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm12, %zmm20 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm5, %zmm5 -; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm12, %zmm5 {%k2} -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,u,1,u,1,u,u,u,10,u,11,u,11,u,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm9, %zmm8 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm9 +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm0, %zmm15 +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm0, %zmm17 +; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm1, %zmm17 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm0, %zmm9 +; AVX512F-FAST-NEXT: vpermd %zmm21, %zmm1, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm0, %zmm4 +; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm1, %zmm4 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm0, %zmm8 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm1, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm16 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm21 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm21 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm28 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm29 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm0, %zmm23 +; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm10, %zmm23 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm20, %zmm0, %zmm20 +; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm10, %zmm20 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm10, %zmm0 {%k2} +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,u,1,u,1,u,u,u,10,u,11,u,11,u,u,u> +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm12 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm10 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm13, %zmm12 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm22 -; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm13, %zmm22 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm22 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm24 -; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm13, %zmm24 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm24 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm14 -; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm13, %zmm14 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm13, %zmm6 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm13, %zmm4 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm13, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm13 -; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm11, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm22, %zmm14 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm14 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm26 +; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm22, %zmm26 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm26 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm30 +; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm22, %zmm30 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm30 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm31 +; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm22, %zmm31 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm31 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm5 +; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm22, %zmm5 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm18 +; AVX512F-FAST-NEXT: vpermd %zmm18, %zmm22, %zmm18 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm18 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm10 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm22, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm22 +; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm11, %zmm10 {%k1} ; AVX512F-FAST-NEXT: movb $-86, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, %zmm30 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm24 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm26 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm13 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm30 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, 512(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 704(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 832(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 768(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 960(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 896(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512F-FAST-NEXT: addq $2504, %rsp # imm = 0x9C8 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, 576(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 512(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 640(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 832(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 768(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 960(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 896(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512F-FAST-NEXT: addq $2312, %rsp # imm = 0x908 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -7502,285 +7494,290 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm13 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm27 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm30 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm28 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm6, %zmm7 +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm11, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm12, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm16, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm31, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm14, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm17, %zmm0, %zmm5 +; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm17, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm17, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm17, %zmm0, %zmm11 +; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm17, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm17, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2w %zmm17, %zmm31, %zmm0 +; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2w %zmm28, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm21, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm0 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm13, %zmm6 +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm28, %zmm31 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm27 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm1 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm13, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm13, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm13, %zmm12 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm13, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm13, %zmm21 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm27, %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm24, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm30 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm26, %zmm30 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm19, %zmm9 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm28 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm0, %zmm27 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm29, %zmm25 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm30, %zmm26 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm27 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm28 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm22 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm23 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm24 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm25 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm26 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm19, %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm11, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm8, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm16 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm7, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm6, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm5, %zmm14 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm29 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm30 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm12, %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm15 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm14 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm6, %zmm13 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm3, %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm19, %zmm11 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm19, %zmm8 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm19, %zmm7 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm19, %zmm6 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm19, %zmm5 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm19, %zmm4 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm3, %zmm19 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm12 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm19, %zmm1 ; AVX512BW-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm0 {%k1} ; AVX512BW-NEXT: movw $8738, %ax # imm = 0x2222 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k2} ; AVX512BW-NEXT: movb $-86, %al ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm20 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm15 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm9 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm4 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k3} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm19, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 960(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512BW-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll index d19eba1cbd51d..cabe23547039b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll @@ -436,49 +436,49 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: subq $152, %rsp ; SSE-NEXT: movaps 112(%rdi), %xmm14 ; SSE-NEXT: movaps 96(%rdi), %xmm13 -; SSE-NEXT: movaps 80(%rdi), %xmm10 -; SSE-NEXT: movaps 64(%rdi), %xmm9 -; SSE-NEXT: movaps (%rdi), %xmm6 -; SSE-NEXT: movaps 16(%rdi), %xmm7 -; SSE-NEXT: movaps 32(%rdi), %xmm8 -; SSE-NEXT: movaps 48(%rdi), %xmm11 +; SSE-NEXT: movaps 80(%rdi), %xmm11 +; SSE-NEXT: movaps 64(%rdi), %xmm10 +; SSE-NEXT: movaps (%rdi), %xmm7 +; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps 32(%rdi), %xmm9 +; SSE-NEXT: movaps 48(%rdi), %xmm12 ; SSE-NEXT: movaps 96(%rsi), %xmm0 ; SSE-NEXT: movaps 80(%rsi), %xmm1 ; SSE-NEXT: movaps 64(%rsi), %xmm2 ; SSE-NEXT: movaps (%rsi), %xmm3 ; SSE-NEXT: movaps 16(%rsi), %xmm4 ; SSE-NEXT: movaps 32(%rsi), %xmm5 -; SSE-NEXT: movaps 48(%rsi), %xmm12 -; SSE-NEXT: movaps %xmm6, %xmm15 +; SSE-NEXT: movaps 48(%rsi), %xmm6 +; SSE-NEXT: movaps %xmm7, %xmm15 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -490,62 +490,62 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm14 -; SSE-NEXT: movaps 128(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; SSE-NEXT: movaps 144(%rdi), %xmm10 -; SSE-NEXT: movaps 144(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm10, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; SSE-NEXT: movaps 160(%rdi), %xmm12 +; SSE-NEXT: movaps 128(%rdi), %xmm15 +; SSE-NEXT: movaps 128(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: movaps 144(%rdi), %xmm13 +; SSE-NEXT: movaps 144(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: movaps 160(%rdi), %xmm10 ; SSE-NEXT: movaps 160(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm12, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE-NEXT: movaps 176(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm10, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movaps 176(%rdi), %xmm8 ; SSE-NEXT: movaps 176(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: movaps %xmm8, %xmm11 ; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: movaps 192(%rdi), %xmm7 -; SSE-NEXT: movaps 192(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE-NEXT: movaps 208(%rdi), %xmm1 -; SSE-NEXT: movaps 208(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movaps 224(%rdi), %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: movaps 192(%rdi), %xmm6 +; SSE-NEXT: movaps 192(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movaps 208(%rdi), %xmm5 +; SSE-NEXT: movaps 208(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movaps 224(%rdi), %xmm1 ; SSE-NEXT: movaps 224(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: movaps 240(%rdi), %xmm3 -; SSE-NEXT: movaps 240(%rsi), %xmm5 +; SSE-NEXT: movaps 240(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: movaps %xmm3, 496(%rdx) ; SSE-NEXT: movaps %xmm0, 480(%rdx) -; SSE-NEXT: movaps %xmm2, 464(%rdx) -; SSE-NEXT: movaps %xmm4, 448(%rdx) -; SSE-NEXT: movaps %xmm1, 432(%rdx) -; SSE-NEXT: movaps %xmm6, 416(%rdx) -; SSE-NEXT: movaps %xmm7, 400(%rdx) -; SSE-NEXT: movaps %xmm8, 384(%rdx) -; SSE-NEXT: movaps %xmm9, 368(%rdx) +; SSE-NEXT: movaps %xmm1, 464(%rdx) +; SSE-NEXT: movaps %xmm2, 448(%rdx) +; SSE-NEXT: movaps %xmm5, 432(%rdx) +; SSE-NEXT: movaps %xmm7, 416(%rdx) +; SSE-NEXT: movaps %xmm6, 400(%rdx) +; SSE-NEXT: movaps %xmm9, 384(%rdx) +; SSE-NEXT: movaps %xmm8, 368(%rdx) ; SSE-NEXT: movaps %xmm11, 352(%rdx) -; SSE-NEXT: movaps %xmm12, 336(%rdx) -; SSE-NEXT: movaps %xmm13, 320(%rdx) -; SSE-NEXT: movaps %xmm10, 304(%rdx) -; SSE-NEXT: movaps %xmm15, 288(%rdx) -; SSE-NEXT: movaps %xmm14, 272(%rdx) +; SSE-NEXT: movaps %xmm10, 336(%rdx) +; SSE-NEXT: movaps %xmm12, 320(%rdx) +; SSE-NEXT: movaps %xmm13, 304(%rdx) +; SSE-NEXT: movaps %xmm14, 288(%rdx) +; SSE-NEXT: movaps %xmm15, 272(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll index a4482bafbd535..afe43e3c7379d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -191,42 +191,42 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride3_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps (%rsi), %xmm3 -; SSE-NEXT: movaps 16(%rsi), %xmm5 +; SSE-NEXT: movaps (%rdi), %xmm1 +; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: movaps (%rsi), %xmm5 +; SSE-NEXT: movaps 16(%rsi), %xmm6 ; SSE-NEXT: movaps (%rdx), %xmm2 -; SSE-NEXT: movaps 16(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] +; SSE-NEXT: movaps 16(%rdx), %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm1, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] ; SSE-NEXT: movaps %xmm1, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm4[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,0] -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movaps %xmm0, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm2[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm3[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] ; SSE-NEXT: movaps %xmm9, (%rcx) -; SSE-NEXT: movaps %xmm3, 16(%rcx) -; SSE-NEXT: movaps %xmm8, 48(%rcx) -; SSE-NEXT: movaps %xmm5, 64(%rcx) -; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps %xmm1, 80(%rcx) +; SSE-NEXT: movaps %xmm5, 16(%rcx) +; SSE-NEXT: movaps %xmm4, 48(%rcx) +; SSE-NEXT: movaps %xmm6, 64(%rcx) +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride3_vf8: @@ -374,83 +374,83 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride3_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm4 +; SSE-NEXT: movaps (%rdi), %xmm1 ; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm1 -; SSE-NEXT: movaps 48(%rdi), %xmm8 -; SSE-NEXT: movaps (%rsi), %xmm5 +; SSE-NEXT: movaps 32(%rdi), %xmm4 +; SSE-NEXT: movaps 48(%rdi), %xmm5 +; SSE-NEXT: movaps (%rsi), %xmm7 ; SSE-NEXT: movaps 16(%rsi), %xmm9 ; SSE-NEXT: movaps 32(%rsi), %xmm10 ; SSE-NEXT: movaps 48(%rsi), %xmm11 ; SSE-NEXT: movaps 16(%rdx), %xmm0 ; SSE-NEXT: movaps 32(%rdx), %xmm3 -; SSE-NEXT: movaps 48(%rdx), %xmm7 -; SSE-NEXT: movaps %xmm8, %xmm12 +; SSE-NEXT: movaps 48(%rdx), %xmm8 +; SSE-NEXT: movaps %xmm5, %xmm12 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] -; SSE-NEXT: movaps %xmm8, %xmm13 -; SSE-NEXT: movaps %xmm8, %xmm6 +; SSE-NEXT: movaps %xmm5, %xmm13 +; SSE-NEXT: movaps %xmm5, %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm11[3,3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm11[3,3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm8[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm12[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm7[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm8[0,3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm13[2,0] -; SSE-NEXT: movaps %xmm1, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] -; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm10[3,3] +; SSE-NEXT: movaps %xmm4, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] +; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: movaps %xmm4, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm10[3,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm12[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm3[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm13[2,0] -; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm9[1] -; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm13[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm14[2,0] +; SSE-NEXT: movaps %xmm2, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm9[1] ; SSE-NEXT: movaps %xmm2, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm9[3,3] -; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: movaps %xmm0, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm12[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm0[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2,0] -; SSE-NEXT: movaps %xmm4, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] -; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm14[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm0[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,0] +; SSE-NEXT: movaps %xmm1, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] +; SSE-NEXT: movaps %xmm1, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] ; SSE-NEXT: movaps (%rdx), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm12[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm14[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm3[2,0] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm7[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm0[2,3] -; SSE-NEXT: movaps %xmm13, (%rcx) -; SSE-NEXT: movaps %xmm5, 16(%rcx) -; SSE-NEXT: movaps %xmm15, 48(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm3[2,0] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[1,2],mem[2,3] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[1,2],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm8[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,3] +; SSE-NEXT: movaps %xmm15, (%rcx) +; SSE-NEXT: movaps %xmm7, 16(%rcx) +; SSE-NEXT: movaps %xmm13, 48(%rcx) ; SSE-NEXT: movaps %xmm9, 64(%rcx) -; SSE-NEXT: movaps %xmm14, 96(%rcx) +; SSE-NEXT: movaps %xmm12, 96(%rcx) ; SSE-NEXT: movaps %xmm10, 112(%rcx) ; SSE-NEXT: movaps %xmm6, 144(%rcx) ; SSE-NEXT: movaps %xmm11, 160(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] -; SSE-NEXT: movaps %xmm4, 32(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] +; SSE-NEXT: movaps %xmm1, 32(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] ; SSE-NEXT: movaps %xmm2, 80(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: movaps %xmm1, 128(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] -; SSE-NEXT: movaps %xmm8, 176(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] +; SSE-NEXT: movaps %xmm4, 128(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3] +; SSE-NEXT: movaps %xmm5, 176(%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride3_vf16: @@ -705,9 +705,9 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps 16(%rdi), %xmm2 ; SSE-NEXT: movaps 32(%rdi), %xmm3 ; SSE-NEXT: movaps 48(%rdi), %xmm4 -; SSE-NEXT: movaps (%rsi), %xmm12 -; SSE-NEXT: movaps 16(%rsi), %xmm11 -; SSE-NEXT: movaps 32(%rsi), %xmm10 +; SSE-NEXT: movaps (%rsi), %xmm10 +; SSE-NEXT: movaps 16(%rsi), %xmm13 +; SSE-NEXT: movaps 32(%rsi), %xmm12 ; SSE-NEXT: movaps 48(%rsi), %xmm9 ; SSE-NEXT: movaps (%rdx), %xmm5 ; SSE-NEXT: movaps 16(%rdx), %xmm6 @@ -715,47 +715,47 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps 48(%rdx), %xmm8 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[0,3] -; SSE-NEXT: movaps %xmm5, %xmm14 +; SSE-NEXT: movaps %xmm5, %xmm11 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm12[3,3] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm10[3,3] ; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm14[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm13[0,2] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm11[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[0,3] ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm11[3,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm13[3,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm6[1,1] ; SSE-NEXT: movaps %xmm6, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[0,3] ; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm10[3,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm12[3,3] ; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[0,3] @@ -772,62 +772,63 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdi), %xmm9 ; SSE-NEXT: movaps 64(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rsi), %xmm12 -; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm12[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2] -; SSE-NEXT: movaps 80(%rdi), %xmm2 -; SSE-NEXT: movaps 80(%rdx), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,3] +; SSE-NEXT: movaps 80(%rdi), %xmm10 +; SSE-NEXT: movaps 80(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rsi), %xmm8 -; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: movaps %xmm10, %xmm11 ; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm8[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm1[0,2] -; SSE-NEXT: movaps 96(%rdi), %xmm0 -; SSE-NEXT: movaps 96(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[0,3] -; SSE-NEXT: movaps 96(%rsi), %xmm6 -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm3[0,2] -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps 112(%rdx), %xmm10 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm10[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,2] +; SSE-NEXT: movaps 96(%rdi), %xmm4 +; SSE-NEXT: movaps 96(%rdx), %xmm13 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[0,3] +; SSE-NEXT: movaps 96(%rsi), %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm13[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,2] +; SSE-NEXT: movaps 112(%rdi), %xmm0 +; SSE-NEXT: movaps 112(%rdx), %xmm7 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm7[0,3] ; SSE-NEXT: movaps 112(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm10[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; SSE-NEXT: movaps %xmm15, %xmm5 -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[1,2],mem[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm14[2,3] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -840,19 +841,19 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,2],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm10[2,3] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[1,2],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm13[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm1, 352(%rcx) -; SSE-NEXT: movaps %xmm4, 336(%rcx) -; SSE-NEXT: movaps %xmm6, 304(%rcx) -; SSE-NEXT: movaps %xmm7, 288(%rcx) +; SSE-NEXT: movaps %xmm3, 336(%rcx) +; SSE-NEXT: movaps %xmm5, 304(%rcx) +; SSE-NEXT: movaps %xmm6, 288(%rcx) ; SSE-NEXT: movaps %xmm8, 256(%rcx) ; SSE-NEXT: movaps %xmm11, 240(%rcx) ; SSE-NEXT: movaps %xmm12, 208(%rcx) -; SSE-NEXT: movaps %xmm13, 192(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 192(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 160(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -869,12 +870,12 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0,1,3] -; SSE-NEXT: movaps %xmm3, 368(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: movaps %xmm0, 320(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] -; SSE-NEXT: movaps %xmm2, 272(%rcx) +; SSE-NEXT: movaps %xmm0, 368(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] +; SSE-NEXT: movaps %xmm4, 320(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0,1,3] +; SSE-NEXT: movaps %xmm10, 272(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] ; SSE-NEXT: movaps %xmm9, 224(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -886,8 +887,8 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 80(%rcx) -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,3] ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq @@ -1064,25 +1065,25 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vbroadcastsd 96(%rdx), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm15 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm15[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm12 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] ; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm12 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm8[1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3,4],ymm4[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm12[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0],ymm4[1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm10[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm10[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] @@ -1096,9 +1097,9 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm11, 64(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 288(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm15, 288(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm14, 352(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rcx) @@ -1117,7 +1118,7 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-LABEL: store_i32_stride3_vf32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm1 @@ -1142,22 +1143,22 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = [5,6,5,6,5,6,7,7] ; AVX2-FAST-NEXT: vpermps %ymm13, %ymm9, %ymm5 -; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm14[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5,6],ymm7[7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm14[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm8, %ymm6 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm11[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6],ymm7[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2,3],ymm6[4],ymm15[5,6],ymm6[7] ; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2],ymm6[3,4],ymm15[5],ymm6[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm14[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0],ymm11[1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm6[1,1,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm7[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm12[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] @@ -1168,10 +1169,10 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7] ; AVX2-FAST-NEXT: vpermps %ymm10, %ymm8, %ymm10 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5,6],ymm10[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5,6],ymm10[7] ; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm15[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm3[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7] @@ -1197,11 +1198,11 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm3, 288(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm4, 352(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm12, 160(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm13, 128(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm11, 224(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm6, 192(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm5, 256(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) @@ -1269,25 +1270,25 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 96(%rdx), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm15[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm8[1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3,4],ymm4[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm12[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0],ymm4[1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm10[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm10[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] @@ -1301,9 +1302,9 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 288(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, 288(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 352(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rcx) @@ -1371,80 +1372,81 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i32_stride3_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $664, %rsp # imm = 0x298 -; SSE-NEXT: movaps (%rdi), %xmm1 -; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm3 -; SSE-NEXT: movaps 48(%rdi), %xmm4 -; SSE-NEXT: movaps (%rsi), %xmm12 +; SSE-NEXT: movaps (%rdi), %xmm2 +; SSE-NEXT: movaps 16(%rdi), %xmm4 +; SSE-NEXT: movaps 32(%rdi), %xmm5 +; SSE-NEXT: movaps 48(%rdi), %xmm6 +; SSE-NEXT: movaps (%rsi), %xmm0 ; SSE-NEXT: movaps 16(%rsi), %xmm11 -; SSE-NEXT: movaps 32(%rsi), %xmm10 -; SSE-NEXT: movaps 48(%rsi), %xmm9 -; SSE-NEXT: movaps (%rdx), %xmm5 -; SSE-NEXT: movaps 16(%rdx), %xmm6 -; SSE-NEXT: movaps 32(%rdx), %xmm7 -; SSE-NEXT: movaps 48(%rdx), %xmm8 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[0,3] -; SSE-NEXT: movaps %xmm5, %xmm14 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm12[3,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm14[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm13[0,2] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[0,3] +; SSE-NEXT: movaps 32(%rsi), %xmm14 +; SSE-NEXT: movaps 48(%rsi), %xmm3 +; SSE-NEXT: movaps (%rdx), %xmm7 +; SSE-NEXT: movaps 16(%rdx), %xmm8 +; SSE-NEXT: movaps 32(%rdx), %xmm9 +; SSE-NEXT: movaps 48(%rdx), %xmm10 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[0,3] +; SSE-NEXT: movaps %xmm7, %xmm12 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[0,3] +; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm11[3,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm11[3,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm8[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[0,3] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[0,3] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm10[3,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm14[3,3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[0,3] -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[0,3] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm9[3,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm3[3,3] +; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm10[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdi), %xmm2 -; SSE-NEXT: movaps 64(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdx), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1458,10 +1460,11 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdi), %xmm2 -; SSE-NEXT: movaps 80(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdx), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1475,10 +1478,11 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rdi), %xmm2 -; SSE-NEXT: movaps 96(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdx), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1492,10 +1496,11 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdi), %xmm2 -; SSE-NEXT: movaps 112(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdx), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1509,10 +1514,11 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdi), %xmm2 -; SSE-NEXT: movaps 128(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdx), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1525,231 +1531,238 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm13 -; SSE-NEXT: movaps 144(%rdx), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[0,3] +; SSE-NEXT: movaps 144(%rdi), %xmm2 +; SSE-NEXT: movaps 144(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm14 +; SSE-NEXT: movaps 160(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm14, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm9 -; SSE-NEXT: movaps 160(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3] -; SSE-NEXT: movaps 160(%rsi), %xmm15 -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: movaps 176(%rdi), %xmm12 +; SSE-NEXT: movaps 176(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm15[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdi), %xmm15 -; SSE-NEXT: movaps 176(%rdx), %xmm2 +; SSE-NEXT: movaps 176(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm12, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3] -; SSE-NEXT: movaps 176(%rsi), %xmm11 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdi), %xmm13 +; SSE-NEXT: movaps 192(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rsi), %xmm11 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm11[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm11[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdi), %xmm11 -; SSE-NEXT: movaps 192(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3] -; SSE-NEXT: movaps 192(%rsi), %xmm14 -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: movaps 208(%rdi), %xmm6 +; SSE-NEXT: movaps 208(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm14[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] -; SSE-NEXT: movaps 208(%rdi), %xmm8 -; SSE-NEXT: movaps 208(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3] -; SSE-NEXT: movaps 208(%rsi), %xmm7 -; SSE-NEXT: movaps %xmm8, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; SSE-NEXT: movaps 208(%rsi), %xmm8 +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[0,2] -; SSE-NEXT: movaps 224(%rdi), %xmm2 -; SSE-NEXT: movaps 224(%rdx), %xmm3 -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,2] +; SSE-NEXT: movaps 224(%rdi), %xmm5 +; SSE-NEXT: movaps 224(%rdx), %xmm15 +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm15[0,3] +; SSE-NEXT: movaps 224(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm15[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,2] +; SSE-NEXT: movaps 240(%rdi), %xmm2 +; SSE-NEXT: movaps 240(%rdx), %xmm9 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,3] -; SSE-NEXT: movaps 224(%rsi), %xmm5 -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm9[0,3] +; SSE-NEXT: movaps 240(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] -; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 240(%rdx), %xmm12 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm12[0,3] -; SSE-NEXT: movaps 240(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm12[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[1,2],mem[2,3] -; SSE-NEXT: shufps $233, (%rsp), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1,2],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm12[2,3] -; SSE-NEXT: movaps %xmm1, 736(%rcx) -; SSE-NEXT: movaps %xmm4, 720(%rcx) -; SSE-NEXT: movaps %xmm5, 688(%rcx) -; SSE-NEXT: movaps %xmm6, 672(%rcx) -; SSE-NEXT: movaps %xmm7, 640(%rcx) -; SSE-NEXT: movaps %xmm10, 624(%rcx) -; SSE-NEXT: movaps %xmm14, 592(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 576(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 544(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 528(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 496(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 480(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 448(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 432(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 400(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 384(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 352(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 336(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 304(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 288(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 256(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 240(%rcx) +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 208(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%rcx) +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rcx) +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%rcx) +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rcx) +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rcx) +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rcx) +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: movaps %xmm0, 752(%rcx) +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[1,2],mem[2,3] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[1,2],mem[2,3] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[1,2],mem[2,3] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[1,2],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm15[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm9[2,3] +; SSE-NEXT: movaps %xmm0, 736(%rcx) +; SSE-NEXT: movaps %xmm3, 720(%rcx) +; SSE-NEXT: movaps %xmm4, 688(%rcx) +; SSE-NEXT: movaps %xmm7, 672(%rcx) +; SSE-NEXT: movaps %xmm8, 640(%rcx) +; SSE-NEXT: movaps %xmm10, 624(%rcx) +; SSE-NEXT: movaps %xmm11, 592(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 576(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 544(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 528(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 496(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 480(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 448(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 432(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 384(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 304(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 256(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] -; SSE-NEXT: movaps %xmm2, 704(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] -; SSE-NEXT: movaps %xmm8, 656(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0,1,3] -; SSE-NEXT: movaps %xmm11, 608(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] -; SSE-NEXT: movaps %xmm15, 560(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] -; SSE-NEXT: movaps %xmm9, 512(%rcx) +; SSE-NEXT: movaps %xmm2, 752(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3] +; SSE-NEXT: movaps %xmm5, 704(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] +; SSE-NEXT: movaps %xmm6, 656(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0,1,3] -; SSE-NEXT: movaps %xmm13, 464(%rcx) -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,3] +; SSE-NEXT: movaps %xmm13, 608(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0,1,3] +; SSE-NEXT: movaps %xmm12, 560(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0,1,3] +; SSE-NEXT: movaps %xmm14, 512(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: movaps %xmm0, 464(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 416(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] @@ -1763,7 +1776,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 224(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 176(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2024,63 +2037,62 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-LABEL: store_i32_stride3_vf64: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm15 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm5 +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm4 -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm8 -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm6 -; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm7 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm7 ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm5 -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm6 +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,0,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vbroadcastsd (%rdx), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[0,0,2,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,0,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vbroadcastsd 32(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,1,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[0,0,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vbroadcastsd 64(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 88(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[2,1,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] @@ -2092,8 +2104,8 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastsd 96(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm13 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm14 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 120(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2104,25 +2116,26 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[0,0,2,1] +; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vbroadcastsd 128(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %ymm10 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %ymm12 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 152(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovaps 128(%rdx), %ymm9 -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[2,1,3,3] +; AVX2-SLOW-NEXT: vmovaps 128(%rdx), %ymm10 +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm10[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,0,2,1] +; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,0,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vbroadcastsd 160(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2143,7 +2156,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vbroadcastsd 192(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %ymm4 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] @@ -2164,51 +2177,50 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vbroadcastsd 248(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 248(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,1,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm15[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7] +; AVX2-SLOW-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0],ymm11[1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] -; AVX2-SLOW-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7] ; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0],ymm11[1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0],ymm8[1,2],ymm15[3],ymm8[4,5],ymm15[6],ymm8[7] ; AVX2-SLOW-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm14[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vpermpd $165, (%rsp), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] @@ -2224,19 +2236,18 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 704(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm3, 608(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm6, 512(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 416(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm13, 320(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm14, 224(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 128(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm10, 416(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm14, 320(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm13, 224(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm11, 32(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 736(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 672(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 640(%rcx) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 576(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 544(%rcx) @@ -2269,95 +2280,95 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-LABEL: store_i32_stride3_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $232, %rsp -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm12 ; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm8 ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm0 -; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm13 -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm10 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm6, %ymm7 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6],ymm7[7] -; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm10[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = [5,6,5,6,5,6,7,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm10[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[2,3],ymm8[4],ymm4[5,6],ymm8[7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5,6],ymm4[7] -; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm3[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm10 +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm9 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm4 +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm2 +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm13 +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm7, %ymm11 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm6[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6],ymm11[7] +; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm8[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2],ymm6[3,4],ymm11[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = [5,6,5,6,5,6,7,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm3[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7] +; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0],ymm3[1,2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm3 ; AVX2-FAST-NEXT: vbroadcastsd 56(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm9[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm9[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX2-FAST-NEXT: vbroadcastsd 96(%rdx), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm12[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vbroadcastsd 96(%rdx), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm12[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm13[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vbroadcastsd 120(%rdi), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm14[2,1,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm13[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,0,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] ; AVX2-FAST-NEXT: vbroadcastsd 128(%rdx), %ymm3 @@ -2368,74 +2379,74 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 128(%rdx), %ymm2 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vbroadcastsd 152(%rdi), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7] -; AVX2-FAST-NEXT: vbroadcastsd 160(%rdx), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm4 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX2-FAST-NEXT: vbroadcastsd 160(%rdx), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm0 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm2[1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd 184(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm5 -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3],ymm0[4],ymm9[5,6],ymm0[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm5[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0],ymm1[1],ymm9[2,3],ymm1[4],ymm9[5,6],ymm1[7] ; AVX2-FAST-NEXT: vbroadcastsd 192(%rdx), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm5[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm0[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4],ymm9[5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm9 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0],ymm4[1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3],ymm5[4,5],ymm10[6],ymm5[7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vbroadcastsd 216(%rdi), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2],ymm5[3,4],ymm10[5],ymm5[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm10[2],ymm0[3,4],ymm10[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] ; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm9 -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm10 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6],ymm6[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6],ymm7[7] ; AVX2-FAST-NEXT: vbroadcastsd 224(%rdx), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2],ymm6[3,4],ymm11[5],ymm6[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm11 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0],ymm10[1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vbroadcastsd 248(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm11[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovaps %ymm7, 736(%rcx) +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovaps %ymm6, 736(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm10, 704(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm6, 672(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm5, 640(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm4, 608(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm0, 576(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm1, 544(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm2, 512(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm3, 480(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm7, 672(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 640(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 608(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 576(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm2, 544(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm3, 512(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm4, 480(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm8, 448(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm15, 416(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm14, 384(%rcx) @@ -2469,63 +2480,62 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-LABEL: store_i32_stride3_vf64: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 32(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 64(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 88(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] @@ -2537,8 +2547,8 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 96(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2549,25 +2559,26 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 128(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 152(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdx), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm10[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 160(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2588,7 +2599,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 192(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] @@ -2609,51 +2620,50 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0],ymm11[1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0],ymm11[1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0],ymm8[1,2],ymm15[3],ymm8[4,5],ymm15[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm14[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd $165, (%rsp), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] @@ -2669,19 +2679,18 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 704(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 608(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 512(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 416(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 320(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 416(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 320(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 736(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 672(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 640(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 576(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 544(%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll index 94f9a018fe9e4..bf6991e907177 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll @@ -521,14 +521,14 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[1],xmm9[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm14 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm11 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm15[0],xmm2[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm14[0],xmm2[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -555,9 +555,9 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm13[1],xmm14[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm15 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm13[1],xmm15[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm0 @@ -566,8 +566,8 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm10, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm2[2],xmm15[2] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm2[2],xmm14[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[3,0],xmm7[3,0] @@ -596,8 +596,8 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[2],xmm1[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[3,0],xmm13[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm15[3,0],xmm13[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] @@ -836,12 +836,13 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movaps 48(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm13, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movaps %xmm14, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] ; SSE-NEXT: movaps %xmm13, %xmm0 @@ -850,98 +851,97 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdx), %xmm0 -; SSE-NEXT: movaps 64(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movaps 64(%rdi), %xmm10 -; SSE-NEXT: movaps 64(%rsi), %xmm5 -; SSE-NEXT: movaps %xmm10, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movaps 64(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movaps 64(%rdi), %xmm13 +; SSE-NEXT: movaps 64(%rsi), %xmm3 +; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; SSE-NEXT: movaps %xmm14, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 80(%rdx), %xmm0 -; SSE-NEXT: movaps 80(%rcx), %xmm3 -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: movaps 80(%rdi), %xmm5 +; SSE-NEXT: movaps 80(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movaps 80(%rdi), %xmm11 ; SSE-NEXT: movaps 80(%rsi), %xmm7 -; SSE-NEXT: movaps %xmm5, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm6[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm11, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; SSE-NEXT: movaps %xmm11, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 96(%rdx), %xmm1 -; SSE-NEXT: movaps 96(%rcx), %xmm4 +; SSE-NEXT: movaps 96(%rcx), %xmm6 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 96(%rdi), %xmm3 -; SSE-NEXT: movaps 96(%rsi), %xmm7 -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; SSE-NEXT: movaps %xmm6, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE-NEXT: movaps %xmm3, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movaps 96(%rdi), %xmm5 +; SSE-NEXT: movaps 96(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; SSE-NEXT: movaps %xmm9, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movaps %xmm5, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps 112(%rdx), %xmm2 -; SSE-NEXT: movaps 112(%rcx), %xmm9 -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movaps 112(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; SSE-NEXT: movaps 112(%rdi), %xmm0 -; SSE-NEXT: movaps 112(%rsi), %xmm8 +; SSE-NEXT: movaps 112(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, 496(%r8) -; SSE-NEXT: movaps %xmm7, 480(%r8) +; SSE-NEXT: movaps %xmm4, 480(%r8) ; SSE-NEXT: movaps %xmm1, 464(%r8) -; SSE-NEXT: movaps %xmm4, 448(%r8) -; SSE-NEXT: movaps %xmm3, 432(%r8) -; SSE-NEXT: movaps %xmm11, 416(%r8) -; SSE-NEXT: movaps %xmm6, 400(%r8) -; SSE-NEXT: movaps %xmm13, 384(%r8) -; SSE-NEXT: movaps %xmm5, 368(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 352(%r8) -; SSE-NEXT: movaps %xmm15, 336(%r8) +; SSE-NEXT: movaps %xmm3, 448(%r8) +; SSE-NEXT: movaps %xmm5, 432(%r8) +; SSE-NEXT: movaps %xmm10, 416(%r8) +; SSE-NEXT: movaps %xmm9, 400(%r8) +; SSE-NEXT: movaps %xmm12, 384(%r8) +; SSE-NEXT: movaps %xmm11, 368(%r8) +; SSE-NEXT: movaps %xmm15, 352(%r8) +; SSE-NEXT: movaps %xmm8, 336(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%r8) -; SSE-NEXT: movaps %xmm10, 304(%r8) +; SSE-NEXT: movaps %xmm13, 304(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 288(%r8) -; SSE-NEXT: movaps %xmm12, 272(%r8) +; SSE-NEXT: movaps %xmm14, 272(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r8) -; SSE-NEXT: movaps %xmm14, 208(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -973,7 +973,7 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i32_stride4_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $472, %rsp # imm = 0x1D8 +; AVX1-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm1 @@ -999,7 +999,7 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] @@ -1015,12 +1015,13 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm12[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1031,26 +1032,27 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm13[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm7[0] +; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm9[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1061,41 +1063,39 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm5[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[1],xmm12[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm3[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm13 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[1],xmm13[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm8 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm11[1],xmm8[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm11[0],xmm2[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm6[0],xmm2[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1112,6 +1112,19 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm1[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm1[2],xmm14[2] @@ -1124,72 +1137,61 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm12[2],xmm1[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm13[2],xmm1[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm1[3,0],xmm14[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = zero,zero,xmm8[2],xmm10[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,zero,xmm9[2],xmm10[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm1[3,0],xmm13[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm5[2],xmm7[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,0],xmm1[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm7[2],xmm9[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,0],xmm1[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[3,0],xmm10[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm5[2],xmm6[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm1[3,0],xmm9[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm3[2],xmm4[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,0],xmm6[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm12[3,0],xmm1[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm11[2] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm6[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[3,0],xmm4[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[3,0],xmm11[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 480(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 416(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 352(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1208,54 +1210,54 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX1-ONLY-NEXT: addq $472, %rsp # imm = 0x1D8 +; AVX1-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i32_stride4_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: pushq %rax -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm12 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm10 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm10[0],xmm13[1],xmm10[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm12 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm10 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 @@ -1263,85 +1265,85 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm15 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,0,2,1] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm13 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm14 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,0,2,1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1],ymm9[2,3],ymm0[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2,3],ymm0[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm0 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[4],ymm14[4],ymm13[5],ymm14[5] ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm14 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1],ymm6[2,3],ymm13[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm14 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm14[0],ymm3[0],ymm14[1],ymm3[1],ymm14[4],ymm3[4],ymm14[5],ymm3[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[4],ymm15[4],ymm13[5],ymm15[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm3[2],ymm14[3],ymm3[3],ymm14[6],ymm3[6],ymm14[7],ymm3[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm14[0],ymm4[0],ymm14[1],ymm4[1],ymm14[4],ymm4[4],ymm14[5],ymm4[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm15 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[4],ymm15[4],ymm13[5],ymm15[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm4[2],ymm14[3],ymm4[3],ymm14[6],ymm4[6],ymm14[7],ymm4[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7] ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm13 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm13[0],ymm3[0],ymm13[1],ymm3[1],ymm13[4],ymm3[4],ymm13[5],ymm3[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[4],ymm4[4],ymm13[5],ymm4[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3],ymm7[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[6],ymm0[6],ymm15[7],ymm0[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 448(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 352(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 320(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm12, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 416(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 352(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm10, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 416(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm8, 384(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) @@ -1576,142 +1578,142 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps 64(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps 64(%rdi), %xmm6 +; SSE-NEXT: movaps 64(%rdi), %xmm5 ; SSE-NEXT: movaps 64(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdx), %xmm0 ; SSE-NEXT: movaps 80(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps 80(%rdi), %xmm6 +; SSE-NEXT: movaps 80(%rdi), %xmm5 ; SSE-NEXT: movaps 80(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rdx), %xmm0 ; SSE-NEXT: movaps 96(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps 96(%rdi), %xmm6 +; SSE-NEXT: movaps 96(%rdi), %xmm5 ; SSE-NEXT: movaps 96(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdx), %xmm0 ; SSE-NEXT: movaps 112(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps 112(%rdi), %xmm6 +; SSE-NEXT: movaps 112(%rdi), %xmm5 ; SSE-NEXT: movaps 112(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdx), %xmm0 ; SSE-NEXT: movaps 128(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps 128(%rdi), %xmm6 +; SSE-NEXT: movaps 128(%rdi), %xmm5 ; SSE-NEXT: movaps 128(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdx), %xmm0 ; SSE-NEXT: movaps 144(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps 144(%rdi), %xmm6 +; SSE-NEXT: movaps 144(%rdi), %xmm5 ; SSE-NEXT: movaps 144(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%rdx), %xmm0 ; SSE-NEXT: movaps 160(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps 160(%rdi), %xmm6 +; SSE-NEXT: movaps 160(%rdi), %xmm5 ; SSE-NEXT: movaps 160(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%rdx), %xmm0 ; SSE-NEXT: movaps 176(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 @@ -1733,90 +1735,90 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 192(%rdx), %xmm0 ; SSE-NEXT: movaps 192(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movaps 192(%rdi), %xmm12 -; SSE-NEXT: movaps 192(%rsi), %xmm4 -; SSE-NEXT: movaps %xmm12, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] -; SSE-NEXT: movaps %xmm13, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm3[1] +; SSE-NEXT: movaps 192(%rsi), %xmm3 +; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; SSE-NEXT: movaps %xmm14, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] ; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 208(%rdx), %xmm0 ; SSE-NEXT: movaps 208(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movaps 208(%rdi), %xmm3 -; SSE-NEXT: movaps 208(%rsi), %xmm9 -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movaps 208(%rdi), %xmm13 +; SSE-NEXT: movaps 208(%rsi), %xmm7 +; SSE-NEXT: movaps %xmm13, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 224(%rdx), %xmm1 -; SSE-NEXT: movaps 224(%rcx), %xmm2 +; SSE-NEXT: movaps 224(%rcx), %xmm6 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 224(%rdi), %xmm7 -; SSE-NEXT: movaps 224(%rsi), %xmm5 -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: movaps %xmm8, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; SSE-NEXT: movaps %xmm7, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movaps 224(%rdi), %xmm5 +; SSE-NEXT: movaps 224(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movaps %xmm5, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps 240(%rdx), %xmm2 -; SSE-NEXT: movaps 240(%rcx), %xmm9 -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; SSE-NEXT: movaps 240(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 240(%rsi), %xmm10 +; SSE-NEXT: movaps 240(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, 1008(%r8) -; SSE-NEXT: movaps %xmm5, 992(%r8) +; SSE-NEXT: movaps %xmm4, 992(%r8) ; SSE-NEXT: movaps %xmm1, 976(%r8) -; SSE-NEXT: movaps %xmm4, 960(%r8) -; SSE-NEXT: movaps %xmm7, 944(%r8) -; SSE-NEXT: movaps %xmm11, 928(%r8) -; SSE-NEXT: movaps %xmm8, 912(%r8) -; SSE-NEXT: movaps %xmm14, 896(%r8) -; SSE-NEXT: movaps %xmm3, 880(%r8) +; SSE-NEXT: movaps %xmm3, 960(%r8) +; SSE-NEXT: movaps %xmm5, 944(%r8) +; SSE-NEXT: movaps %xmm10, 928(%r8) +; SSE-NEXT: movaps %xmm9, 912(%r8) +; SSE-NEXT: movaps %xmm11, 896(%r8) +; SSE-NEXT: movaps %xmm13, 880(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 864(%r8) -; SSE-NEXT: movaps %xmm6, 848(%r8) +; SSE-NEXT: movaps %xmm8, 848(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 832(%r8) ; SSE-NEXT: movaps %xmm12, 816(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 800(%r8) -; SSE-NEXT: movaps %xmm13, 784(%r8) +; SSE-NEXT: movaps %xmm14, 784(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 768(%r8) ; SSE-NEXT: movaps %xmm15, 752(%r8) @@ -2081,7 +2083,7 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 144(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] @@ -2116,25 +2118,25 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 176(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm12[0] +; AVX1-ONLY-NEXT: vmovaps 176(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm13[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm8[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2145,11 +2147,11 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm4[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2163,210 +2165,210 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 240(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm10[1],xmm7[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm15 -; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm9[1],xmm7[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm12 +; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm6[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm5[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm12[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm13[2],xmm0[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = zero,zero,xmm8[2],xmm10[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[3,0],xmm14[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = zero,zero,xmm8[2],xmm9[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[3,0],xmm13[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm4[2],xmm5[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm4[2],xmm6[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm0[3,0],xmm9[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm0[3,0],xmm10[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm2[2],xmm3[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[3,0],xmm0[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm11[3,0],xmm0[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm1[2],xmm6[2] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm1[2],xmm5[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,0],xmm10[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,0],xmm9[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] @@ -2374,8 +2376,8 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm2, 928(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 864(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 800(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 736(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 672(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 736(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 672(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2603,49 +2605,49 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm4 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,2,2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm4[2,3],ymm11[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1],ymm4[2,3],ymm13[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[4],ymm0[4],ymm11[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[6],ymm0[6],ymm11[7],ymm0[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 160(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm0 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[4],ymm0[4],ymm13[5],ymm0[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1],ymm1[2,3],ymm15[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm13[2],ymm0[2],ymm13[3],ymm0[3],ymm13[6],ymm0[6],ymm13[7],ymm0[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] @@ -2684,13 +2686,13 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm1, 864(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm13, 832(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 736(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 704(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 608(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 576(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 448(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 352(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 320(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 704(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 608(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 576(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 480(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 448(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 352(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll index 17bd3eb320104..4bb7f80a79564 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -684,160 +684,160 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i32_stride5_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $168, %rsp -; SSE-NEXT: movdqa (%rsi), %xmm10 +; SSE-NEXT: movdqa (%rsi), %xmm8 ; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movdqa 32(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdx), %xmm5 -; SSE-NEXT: movdqa 16(%rdx), %xmm9 +; SSE-NEXT: movdqa 32(%rsi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdx), %xmm10 +; SSE-NEXT: movdqa 16(%rdx), %xmm7 ; SSE-NEXT: movdqa 32(%rdx), %xmm4 -; SSE-NEXT: movaps (%rcx), %xmm12 -; SSE-NEXT: movaps 16(%rcx), %xmm13 -; SSE-NEXT: movaps 32(%rcx), %xmm11 +; SSE-NEXT: movaps (%rcx), %xmm5 +; SSE-NEXT: movaps 16(%rcx), %xmm14 +; SSE-NEXT: movaps 32(%rcx), %xmm12 ; SSE-NEXT: movaps (%r8), %xmm3 ; SSE-NEXT: movaps 16(%r8), %xmm15 -; SSE-NEXT: movaps 32(%r8), %xmm8 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movaps 32(%r8), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm15[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm8[3,3] +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm11[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE-NEXT: movdqa 48(%rdx), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm7 +; SSE-NEXT: movaps 48(%rcx), %xmm9 ; SSE-NEXT: movaps 48(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm3[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movdqa 16(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm8 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movaps %xmm13, %xmm9 +; SSE-NEXT: movaps %xmm14, %xmm10 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm15[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,1] -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm15[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,1] +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[2,0] -; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm14 = xmm14[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[2,0] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,1] -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,1] +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps %xmm7, %xmm3 -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm9, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,0] +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm13 = xmm0[0],xmm13[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3] -; SSE-NEXT: movaps %xmm7, 288(%r9) -; SSE-NEXT: movaps %xmm3, 272(%r9) +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[1,1],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm11[2,0] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm9, 288(%r9) +; SSE-NEXT: movaps %xmm4, 272(%r9) ; SSE-NEXT: movdqa %xmm5, 240(%r9) ; SSE-NEXT: movaps %xmm15, 208(%r9) -; SSE-NEXT: movaps %xmm11, 192(%r9) +; SSE-NEXT: movaps %xmm12, 192(%r9) ; SSE-NEXT: movdqa %xmm6, 160(%r9) -; SSE-NEXT: movaps %xmm13, 128(%r9) -; SSE-NEXT: movaps %xmm9, 112(%r9) -; SSE-NEXT: movdqa %xmm12, 80(%r9) +; SSE-NEXT: movaps %xmm14, 128(%r9) +; SSE-NEXT: movaps %xmm10, 112(%r9) +; SSE-NEXT: movdqa %xmm8, 80(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -846,16 +846,16 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%r9) -; SSE-NEXT: movaps %xmm4, 256(%r9) +; SSE-NEXT: movaps %xmm2, 256(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) -; SSE-NEXT: movaps %xmm2, 176(%r9) +; SSE-NEXT: movaps %xmm3, 176(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) -; SSE-NEXT: movaps %xmm10, 96(%r9) +; SSE-NEXT: movaps %xmm7, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r9) -; SSE-NEXT: movaps %xmm14, 16(%r9) +; SSE-NEXT: movaps %xmm13, 16(%r9) ; SSE-NEXT: addq $168, %rsp ; SSE-NEXT: retq ; @@ -866,33 +866,33 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm8 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm10[1],xmm6[1],zero ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = xmm10[0],xmm6[0],zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm12[0],xmm11[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm11[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm8[1],xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm9[1],xmm8[1],zero ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm14[0],xmm13[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm14[0],xmm13[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = xmm8[0],xmm7[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = xmm9[0],xmm8[0],zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm9[2,3],ymm5[4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm15, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0],ymm5[1,2,3],ymm15[4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -910,7 +910,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm11[2],xmm10[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm11[1,1],xmm10[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm9[1,1],ymm2[1,1],ymm9[5,5],ymm2[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1],ymm2[1,1],ymm7[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm6 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4],ymm10[5,6,7] @@ -931,21 +931,21 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm8[2],xmm7[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm8 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = zero,zero,xmm9[2],xmm8[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2,3],ymm13[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm9 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5],ymm9[6],ymm8[7] +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[3,3],xmm8[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[3,3],xmm9[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[3,3],ymm0[3,3],ymm15[7,7],ymm0[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm14[3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm14[2],xmm13[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[1,1],xmm13[1,1] @@ -957,11 +957,11 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm4[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm4 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2],ymm8[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm0[2],ymm9[3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] @@ -969,15 +969,15 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm15[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm13[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm13[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 288(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1118,116 +1118,116 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm2 -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm4 ; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm3 ; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm6 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm11 = <0,1,0,1,u,u,2,2> -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm12 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm13 -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm14 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm9 +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm12 = <0,1,0,1,u,u,2,2> +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm14 +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm11 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm15 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3,4,5],ymm5[6],ymm4[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm15[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm13[2],xmm8[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm8[1],ymm5[2,3,4,5],ymm8[6],ymm5[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm8 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm11, %ymm12 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm11[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4,5],ymm7[6],ymm5[7] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm15[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4,5],ymm7[6],ymm5[7] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm7 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm12, %ymm8 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,3,2,3,2,3,2] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm12 +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm11 ; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1,2,3],ymm6[4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1,2],ymm14[3,4],ymm1[5,6],ymm14[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2],ymm10[3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0],ymm10[1,2,3,4],ymm14[5],ymm10[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm14[1,2],ymm4[3,4],ymm14[5,6],ymm4[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm14[3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm4[0,1],ymm14[2],ymm4[3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm11 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm9 -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm7, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0],ymm0[1,2,3],ymm4[4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3,4],ymm0[5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm1[1,2],ymm13[3,4],ymm1[5,6],ymm13[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2],ymm8[3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4],ymm13[5],ymm8[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm11[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm13[1,2],ymm5[3,4],ymm13[5,6],ymm5[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm13 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1],ymm13[2],ymm5[3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm12 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm10 +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm9, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0],ymm0[1,2,3],ymm5[4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3,4],ymm0[5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm14 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm15[3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm15[3,4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0],ymm5[1,2,3,4],ymm15[5],ymm5[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0],ymm6[1,2,3,4],ymm15[5],ymm6[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm2[2],ymm8[3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm7, 160(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm14, 288(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm10, 256(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm6, (%r9) +; AVX2-FAST-NEXT: vmovaps %ymm9, 160(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm13, 288(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm8, 256(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1509,21 +1509,21 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride5_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $728, %rsp # imm = 0x2D8 +; SSE-NEXT: subq $712, %rsp # imm = 0x2C8 ; SSE-NEXT: movdqa (%rsi), %xmm9 ; SSE-NEXT: movdqa 16(%rsi), %xmm7 ; SSE-NEXT: movdqa 32(%rsi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdx), %xmm11 ; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: movdqa 32(%rdx), %xmm12 -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdx), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%rcx), %xmm3 ; SSE-NEXT: movaps 16(%rcx), %xmm5 ; SSE-NEXT: movaps 32(%rcx), %xmm6 ; SSE-NEXT: movaps (%r8), %xmm4 -; SSE-NEXT: movaps 16(%r8), %xmm15 -; SSE-NEXT: movaps 32(%r8), %xmm13 +; SSE-NEXT: movaps 16(%r8), %xmm13 +; SSE-NEXT: movaps 32(%r8), %xmm12 ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] @@ -1532,58 +1532,58 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm15[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm13[3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm13[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm12[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm12 +; SSE-NEXT: movdqa 48(%rsi), %xmm8 ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm8 -; SSE-NEXT: movaps 48(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] +; SSE-NEXT: movaps 48(%rcx), %xmm2 +; SSE-NEXT: movaps 48(%r8), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 64(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] +; SSE-NEXT: movaps 64(%r8), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rsi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 80(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] +; SSE-NEXT: movaps 80(%r8), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rsi), %xmm0 @@ -1595,22 +1595,22 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 96(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] +; SSE-NEXT: movaps 96(%r8), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rsi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 112(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rcx), %xmm15 ; SSE-NEXT: movaps 112(%r8), %xmm14 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1618,14 +1618,14 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: movdqa (%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm11 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; SSE-NEXT: movdqa %xmm1, %xmm9 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] @@ -1633,7 +1633,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1641,106 +1641,107 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm15[1,1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm13[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE-NEXT: movaps 32(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm13[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm13[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm12[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movdqa 48(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm4[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 64(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 64(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 80(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm13 +; SSE-NEXT: movaps 80(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: movaps %xmm1, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] @@ -1756,37 +1757,34 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm14, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: movaps %xmm14, %xmm6 +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1] ; SSE-NEXT: movaps 112(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm11[0] ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: movaps %xmm1, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm15[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm2[1] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] @@ -1802,7 +1800,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -1815,40 +1813,39 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm9 = xmm0[0],xmm9[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm0[0],xmm6[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3] ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[1,1],mem[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movss {{.*#+}} xmm5 = xmm0[0],xmm5[1,2,3] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 608(%r9) -; SSE-NEXT: movaps %xmm12, 592(%r9) +; SSE-NEXT: movaps %xmm1, 608(%r9) +; SSE-NEXT: movaps %xmm11, 592(%r9) ; SSE-NEXT: movaps %xmm4, 560(%r9) -; SSE-NEXT: movaps %xmm7, 528(%r9) +; SSE-NEXT: movaps %xmm6, 528(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 512(%r9) ; SSE-NEXT: movaps %xmm8, 480(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 448(%r9) -; SSE-NEXT: movaps %xmm9, 432(%r9) +; SSE-NEXT: movaps %xmm12, 432(%r9) ; SSE-NEXT: movaps %xmm13, 400(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%r9) @@ -1888,10 +1885,10 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm5, 496(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 464(%r9) -; SSE-NEXT: movaps %xmm6, 416(%r9) +; SSE-NEXT: movaps %xmm7, 416(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%r9) -; SSE-NEXT: movaps %xmm11, 336(%r9) +; SSE-NEXT: movaps %xmm9, 336(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%r9) ; SSE-NEXT: movaps %xmm10, 256(%r9) @@ -1905,136 +1902,135 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 64(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: addq $728, %rsp # imm = 0x2D8 +; SSE-NEXT: addq $712, %rsp # imm = 0x2C8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $616, %rsp # imm = 0x268 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm7 +; AVX1-ONLY-NEXT: subq $600, %rsp # imm = 0x258 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm9[1],xmm5[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm9[0],xmm5[0],zero,zero +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm9[1],xmm6[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm9[0],xmm6[0],zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm12[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm14 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm15[0],xmm14[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm11[1],xmm7[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm11[0],xmm7[0],zero,zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm4[1],xmm3[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm3[0],zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm4[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm2[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,xmm1[1],xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = xmm1[0],xmm2[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6],ymm6[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm6[1,2,3],ymm13[4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm13 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm14[1],xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm14[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1,2,3],ymm0[4],ymm13[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm0[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm11 = xmm0[0],xmm1[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1,2,3],ymm10[4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm12[1],xmm13[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm11 = xmm12[0],xmm13[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1,2,3],ymm10[4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm9[2],xmm5[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm5 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5],ymm5[6],ymm0[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,zero,xmm5[2],xmm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm13 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1],ymm13[1,1],ymm5[5,5],ymm13[5,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3],xmm5[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm9[2],xmm6[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm7 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,zero,xmm7[2],xmm6[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[1,1],xmm6[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm15 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1],ymm15[1,1],ymm7[5,5],ymm15[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3],xmm7[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,3],ymm9[3,3],ymm10[7,7],ymm9[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,3],ymm6[3,3],ymm9[7,7],ymm6[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2,3,4],ymm9[5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm6[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4],ymm9[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm9[2],ymm5[3,4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm11[2],xmm7[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm9[2],ymm7[3,4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm4[2],xmm3[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5],ymm3[6],ymm0[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm3[2],xmm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm3[2],xmm2[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[1,1],xmm2[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1],ymm3[1,1],ymm4[5,5],ymm3[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm3[3,3] @@ -2042,24 +2038,24 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3],ymm4[3,3],ymm12[7,7],ymm4[7,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm14[3,3],ymm4[3,3],ymm14[7,7],ymm4[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4],ymm4[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4],ymm4[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm4[2],ymm3[3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; AVX1-ONLY-NEXT: vbroadcastss 68(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm2[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm0[2],xmm1[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2068,110 +2064,109 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm10 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm11[1,1],ymm9[5,5],ymm11[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm10[1,1],ymm9[5,5],ymm10[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3],xmm1[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,3],ymm7[3,3],ymm6[7,7],ymm7[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2],ymm5[3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm8 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm4[1,2,3,4],ymm0[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3],xmm0[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[3,3],ymm8[3,3],ymm5[7,7],ymm8[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm6[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm3[1,2,3,4],ymm11[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm11[2],ymm4[3,4,5,6],ymm11[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 100(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = zero,zero,xmm14[2],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm12[2],xmm13[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4,5],ymm4[6],ymm0[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm5[2],xmm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,1],ymm14[1,1],ymm3[5,5],ymm14[5,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm15[3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm4[2],xmm3[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,1],xmm3[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1],ymm12[1,1],ymm13[5,5],ymm12[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2],ymm4[3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3],xmm15[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3],xmm4[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm15[3,3],ymm2[3,3],ymm15[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm4[3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm5[1,2,3,4],ymm0[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm4[3,3],ymm1[3,3],ymm4[7,7],ymm1[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0],ymm7[1,2,3,4],ymm3[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1],ymm3[2],ymm11[3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4],ymm11[5,6,7] ; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm0[1,2,3],ymm11[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = mem[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4,5,6],ymm12[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1,2,3],ymm1[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm11[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm15[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm14[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm2, 544(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 384(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9) +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm14[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4],ymm11[5,6,7] +; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1,2,3],ymm11[4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm11[0,1,2],mem[3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4,5,6],ymm8[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1,2,3],ymm5[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm12[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2],ymm6[3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, 544(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 384(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 224(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 608(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 576(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 608(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 576(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r9) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%r9) @@ -2195,7 +2190,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX1-ONLY-NEXT: addq $616, %rsp # imm = 0x268 +; AVX1-ONLY-NEXT: addq $600, %rsp # imm = 0x258 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2310,11 +2305,11 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm12 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -2325,128 +2320,128 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm6[2],ymm13[3],ymm6[3],ymm13[6],ymm6[6],ymm13[7],ymm6[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[6],ymm6[6],ymm12[7],ymm6[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm12 -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm11 +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm10 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm9 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm8 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3,4],ymm9[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 56(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm8 -; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %ymm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm7 +; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %ymm6 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm4 -; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm3 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1,2],ymm15[3,4],ymm2[5,6],ymm15[7] +; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm3 +; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4],ymm15[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 112(%r8), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4],ymm15[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2],ymm15[3,4],ymm0[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2,3,4],ymm15[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3,4],ymm13[5,6],ymm15[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4],ymm13[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 120(%r8), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm14[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1,2,3],ymm0[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4],ymm12[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vpermilps $78, (%rsp), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1,2,3],ymm12[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = ymm12[0,1,2],mem[3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm1, 544(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm5, 384(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm15, 608(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 544(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 384(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 224(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm12, 64(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm13, 608(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 448(%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 416(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%r9) @@ -2478,47 +2473,47 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i32_stride5_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $584, %rsp # imm = 0x248 +; AVX2-FAST-NEXT: subq $600, %rsp # imm = 0x258 ; AVX2-FAST-NEXT: vmovaps (%r8), %ymm11 ; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm10 ; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm4 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm6 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm15 -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm12 -; AVX2-FAST-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = <0,1,0,1,u,u,2,2> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm7 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm8 -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm8[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm11[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm13 +; AVX2-FAST-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm12 = <0,1,0,1,u,u,2,2> +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm2 +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm9 +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm10[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm10[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm15[2],xmm12[3],xmm15[3] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm10 -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm3[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2],xmm11[3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm0[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2],xmm11[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovaps 64(%r8), %ymm11 @@ -2531,13 +2526,12 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm10 ; AVX2-FAST-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm14, %ymm10 -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %xmm12 -; AVX2-FAST-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2],xmm11[3] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %xmm14 +; AVX2-FAST-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm13 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm13[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovaps 96(%r8), %ymm11 @@ -2545,153 +2539,154 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7] ; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm12, %ymm7 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm11 = [0,1,3,2,3,2,3,2] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm11, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm4, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm5 -; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm7 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm11, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm7 +; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm8 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovaps %ymm8, %ymm9 +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm8 ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm5, %ymm7 -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm12 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm12[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm13[1,2],ymm5[3,4],ymm13[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm2, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm14 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1,2],ymm7[3,4],ymm10[5,6],ymm7[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4],ymm7[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm9[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm7 = ymm14[2],ymm10[2],ymm14[3],ymm10[3],ymm14[6],ymm10[6],ymm14[7],ymm10[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm3, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm10 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm9 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm7 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1,2],ymm2[3,4],ymm8[5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm2 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1,2],ymm3[3,4],ymm8[5,6],ymm3[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3,4],ymm2[5,6],ymm3[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm3 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm12, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 64(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm6 -; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm5 +; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm4 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm12, %ymm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 96(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14 -; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm12 +; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm13 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2],ymm11[3,4],ymm15[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 112(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0],ymm11[1,2,3,4],ymm15[5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0],ymm11[1,2],ymm15[3,4],ymm11[5,6],ymm15[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm1[1,2],ymm11[3,4],ymm1[5,6],ymm11[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2],ymm6[3,4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 112(%r8), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1,2,3,4],ymm11[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm13[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0],ymm6[1,2],ymm15[3,4],ymm6[5,6],ymm15[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm15[3,4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm15[3,4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 120(%r8), %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2],ymm6[3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4],ymm12[5,6,7] -; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1,2,3],ymm12[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm12[0,1,2],mem[3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3],ymm14[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0,1,2],mem[3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4],ymm7[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,3,0,4,5,7,4] @@ -2700,32 +2695,32 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps %ymm0, 544(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm3, 384(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm2, 384(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm7, 224(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm12, 64(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm11, 608(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm6, 608(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 480(%r9) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 448(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%r9) @@ -2751,7 +2746,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-NEXT: addq $584, %rsp # imm = 0x248 +; AVX2-FAST-NEXT: addq $600, %rsp # imm = 0x258 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -2866,11 +2861,11 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -2881,128 +2876,128 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm6[2],ymm13[3],ymm6[3],ymm13[6],ymm6[6],ymm13[7],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[6],ymm6[6],ymm12[7],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3,4],ymm9[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1,2],ymm15[3,4],ymm2[5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4],ymm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 112(%r8), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4],ymm15[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2],ymm15[3,4],ymm0[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2,3,4],ymm15[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3,4],ymm13[5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4],ymm13[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%r8), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm14[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1,2,3],ymm0[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $78, (%rsp), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1,2,3],ymm12[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0,1,2],mem[3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 544(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 384(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, 608(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 544(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 384(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 608(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 448(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%r9) @@ -3262,54 +3257,55 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i32_stride5_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $1736, %rsp # imm = 0x6C8 -; SSE-NEXT: movdqa (%rsi), %xmm12 -; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rsi), %xmm3 +; SSE-NEXT: movdqa (%rsi), %xmm10 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rsi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdx), %xmm8 -; SSE-NEXT: movdqa 16(%rdx), %xmm11 -; SSE-NEXT: movdqa 32(%rdx), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm4 +; SSE-NEXT: movdqa 16(%rdx), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdx), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rcx), %xmm5 ; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps 32(%rcx), %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%r8), %xmm5 -; SSE-NEXT: movaps 16(%r8), %xmm7 +; SSE-NEXT: movaps 32(%rcx), %xmm12 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%r8), %xmm4 +; SSE-NEXT: movaps 16(%r8), %xmm11 ; SSE-NEXT: movaps 32(%r8), %xmm14 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm11[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm10 +; SSE-NEXT: movdqa 48(%rsi), %xmm12 ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 48(%rcx), %xmm6 -; SSE-NEXT: movaps 48(%r8), %xmm13 +; SSE-NEXT: movaps 48(%r8), %xmm7 ; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm13[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rsi), %xmm0 @@ -3318,10 +3314,10 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 64(%rcx), %xmm14 +; SSE-NEXT: movaps 64(%rcx), %xmm13 ; SSE-NEXT: movaps 64(%r8), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3332,10 +3328,10 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 80(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rcx), %xmm14 ; SSE-NEXT: movaps 80(%r8), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3431,7 +3427,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 192(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%r8), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] @@ -3479,47 +3475,47 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movaps 16(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm5[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm11[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[2,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm11[1] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm11[2,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3527,13 +3523,13 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] @@ -3551,47 +3547,46 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm13[1,1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm7[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[2,3] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movaps 80(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm4 @@ -3601,16 +3596,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: movaps %xmm14, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[2,0] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -3707,13 +3702,13 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 160(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm4 +; SSE-NEXT: movaps 160(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] @@ -3721,7 +3716,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3729,43 +3724,43 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 176(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: movaps 176(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm11, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm11, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 192(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: movaps 192(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3773,62 +3768,62 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 208(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm12 +; SSE-NEXT: movaps 208(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: movaps %xmm1, %xmm9 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 224(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: movaps %xmm4, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movaps 240(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE-NEXT: movaps %xmm3, %xmm14 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: movaps %xmm3, %xmm15 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm2[1] ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[2,0] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] @@ -3848,11 +3843,11 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] @@ -3896,10 +3891,10 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] @@ -3910,27 +3905,27 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm13 = xmm0[0],xmm13[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm12 = xmm0[0],xmm12[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] @@ -3947,17 +3942,17 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 1248(%r9) ; SSE-NEXT: movaps %xmm3, 1232(%r9) ; SSE-NEXT: movaps %xmm6, 1200(%r9) -; SSE-NEXT: movaps %xmm8, 1168(%r9) +; SSE-NEXT: movaps %xmm7, 1168(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1152(%r9) -; SSE-NEXT: movaps %xmm9, 1120(%r9) +; SSE-NEXT: movaps %xmm8, 1120(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1088(%r9) -; SSE-NEXT: movaps %xmm10, 1072(%r9) -; SSE-NEXT: movaps %xmm12, 1040(%r9) +; SSE-NEXT: movaps %xmm9, 1072(%r9) +; SSE-NEXT: movaps %xmm13, 1040(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1008(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 992(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 960(%r9) @@ -4041,22 +4036,22 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm4, 1136(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1104(%r9) -; SSE-NEXT: movaps %xmm7, 1056(%r9) +; SSE-NEXT: movaps %xmm12, 1056(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1024(%r9) -; SSE-NEXT: movaps %xmm11, 976(%r9) +; SSE-NEXT: movaps %xmm10, 976(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 944(%r9) -; SSE-NEXT: movaps %xmm13, 896(%r9) +; SSE-NEXT: movaps %xmm11, 896(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 864(%r9) -; SSE-NEXT: movaps %xmm15, 816(%r9) +; SSE-NEXT: movaps %xmm14, 816(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 784(%r9) ; SSE-NEXT: movaps %xmm2, 736(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 704(%r9) -; SSE-NEXT: movaps %xmm14, 656(%r9) +; SSE-NEXT: movaps %xmm15, 656(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 624(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4079,7 +4074,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 256(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) @@ -4095,72 +4090,72 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-LABEL: store_i32_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $1784, %rsp # imm = 0x6F8 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm7[1],xmm8[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm7[0],xmm8[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm5[1],xmm6[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm5[0],xmm6[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm11[0],xmm0[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm14[1],xmm12[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm14[0],xmm12[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm15[1],xmm14[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm15[0],xmm14[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm13[1],xmm10[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = xmm13[0],xmm10[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm10[1],xmm13[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm13[0],zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm5[1],xmm11[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = xmm5[0],xmm11[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm7[1],xmm0[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm7[0],xmm0[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm9, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm9, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm0 @@ -4173,31 +4168,31 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,xmm9[1],xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = xmm9[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm4[1,2,3],ymm6[4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm8[1],xmm0[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = xmm8[0],xmm0[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm12, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm4[1,2,3],ymm9[4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 164(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 +; AVX1-ONLY-NEXT: vbroadcastss 164(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm9 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm4[1],xmm0[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,xmm4[1],xmm0[1],zero ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm4[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3],ymm15[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5,6],ymm9[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r8), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2,3],ymm0[4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1,2,3],ymm0[4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4205,15 +4200,15 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 196(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 196(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm6[1],xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm6[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,xmm9[1],xmm0[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm9[0],xmm0[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3],ymm12[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r8), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4230,110 +4225,108 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm0[1],xmm1[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = xmm0[0],xmm1[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = xmm0[0],xmm1[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm7[2],xmm8[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm5[2],xmm6[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm11 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm15[1,1],ymm1[5,5],ymm15[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3],xmm1[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3],xmm1[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm5[2],ymm1[3,4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm14[2],xmm12[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm15[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1],ymm1[1,1],ymm14[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,1],ymm14[1,1],ymm15[5,5],ymm14[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3],xmm1[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3],xmm1[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm5[2],ymm1[3,4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 68(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm13[2],xmm10[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm10[2],xmm13[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm13 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4341,31 +4334,32 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3],xmm1[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3],xmm1[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm5[2],ymm1[3,4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 100(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm5[2],xmm11[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm7[2],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] @@ -4387,28 +4381,28 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[3,3],xmm1[3,3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3],ymm3[3,3],ymm2[7,7],ymm3[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vbroadcastss 132(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm9[2],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 132(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm8[2],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] @@ -4418,11 +4412,11 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 144(%rcx), %xmm1 @@ -4430,11 +4424,11 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3],xmm1[3,3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3],ymm3[3,3],ymm2[7,7],ymm3[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 @@ -4444,7 +4438,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 164(%rcx), %xmm1 @@ -4483,7 +4477,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4491,9 +4485,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 196(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm6[2],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm9[2],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] @@ -4505,8 +4499,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm1[1,1],ymm11[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm1 @@ -4515,12 +4510,12 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3],ymm8[3,3],ymm9[7,7],ymm8[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm9 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm10 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm10[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] @@ -4543,61 +4538,59 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm5[1,1],ymm4[5,5],ymm5[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3],xmm0[3,3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[3,3],ymm1[3,3],ymm0[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2],ymm6[3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0],ymm3[1,2,3,4],ymm10[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,3],ymm2[3,3],ymm0[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4],ymm7[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm10[2],ymm6[3,4,5,6],ymm10[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm7[2],ymm5[3,4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm15[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4,5,6],ymm6[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm11[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4,5,6],ymm6[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm14[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm13[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2],ymm15[3,4,5,6],ymm6[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -4633,30 +4626,30 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = ymm12[0,1,2],mem[3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1],ymm9[2],ymm14[3,4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1184(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 1024(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 1024(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm12, 864(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm13, 704(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 544(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 384(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 384(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 64(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1248(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4671,7 +4664,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 928(%r9) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 896(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%r9) @@ -4683,7 +4676,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 672(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%r9) @@ -4936,18 +4929,18 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm14 -; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm15 +; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm13 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm13 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] @@ -4955,162 +4948,160 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm13 +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm14 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm13 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 56(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm13 +; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %ymm14 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm13 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm13 +; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm14 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovaps %ymm14, %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm14 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 112(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm2[2],ymm14[3],ymm2[3],ymm14[6],ymm2[6],ymm14[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 120(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 128(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps 128(%rdx), %ymm13 +; AVX2-SLOW-NEXT: vmovaps 128(%rcx), %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %ymm13 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 144(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 152(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 160(%rdx), %ymm12 -; AVX2-SLOW-NEXT: vmovaps 160(%rcx), %ymm11 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps 160(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 160(%rcx), %ymm12 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovaps 160(%rsi), %ymm8 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3,4],ymm9[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovaps 160(%rsi), %ymm9 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 176(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm12[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 184(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdx), %ymm7 +; AVX2-SLOW-NEXT: vmovaps 192(%rdx), %ymm8 ; AVX2-SLOW-NEXT: vmovaps 192(%rcx), %ymm6 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %ymm4 @@ -5122,7 +5113,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5130,104 +5121,103 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 216(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm3 ; AVX2-SLOW-NEXT: vmovaps 224(%rcx), %ymm2 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2],ymm10[3,4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 240(%r8), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2,3,4],ymm15[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm10, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2],ymm15[3,4],ymm10[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 248(%r8), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm15[2],ymm10[3,4,5,6],ymm15[7] -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm1[1,2],ymm11[3,4],ymm1[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 240(%r8), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1,2,3,4],ymm11[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1,2],ymm11[3,4],ymm7[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 248(%r8), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] ; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] ; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm14[1,2,3],ymm7[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm13[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3],ymm14[4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -5242,20 +5232,19 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1184(%r9) ; AVX2-SLOW-NEXT: vmovaps %ymm4, 1024(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 864(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 864(%r9) ; AVX2-SLOW-NEXT: vmovaps %ymm13, 704(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm10, 544(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 544(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm11, 384(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1248(%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 1216(%r9) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1088(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1056(%r9) @@ -5322,302 +5311,297 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-LABEL: store_i32_stride5_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1800, %rsp # imm = 0x708 -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm7 -; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm6 -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm10 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm12 -; AVX2-FAST-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm11 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,1,0,1,u,u,2,2> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm14 -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm9 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm14[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2],xmm1[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm8 +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm5 +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm11 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm13 +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm14 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = <0,1,0,1,u,u,2,2> +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm15 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm4 +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm12 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2],xmm3[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm8[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm13[2],xmm7[3],xmm13[3] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm12[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm5[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm6 -; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm7 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps 64(%r8), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm8 +; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm6 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm6[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovaps 64(%r8), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm15 -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %xmm0 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm1 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm12 = xmm1[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2],xmm12[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm3[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps 96(%r8), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4,5],ymm15[6],ymm12[7] -; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm5[1],ymm1[2,3,4,5],ymm5[6],ymm1[7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 128(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm12 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vmovaps 128(%rdx), %xmm0 +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rcx), %xmm1 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovaps 128(%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2],xmm15[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovaps 128(%r8), %ymm0 +; AVX2-FAST-NEXT: vmovaps 128(%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 128(%r8), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5],ymm5[6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4,5],ymm15[6],ymm12[7] -; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 160(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm12 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vmovaps 160(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 160(%rcx), %xmm1 +; AVX2-FAST-NEXT: vmovaps 160(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2],xmm15[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovaps 160(%r8), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4,5],ymm15[6],ymm12[7] -; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovaps 160(%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 160(%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 160(%r8), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5],ymm5[6],ymm0[7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 192(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm12 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vmovaps 192(%rdx), %xmm0 +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rcx), %xmm1 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2],xmm15[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovaps 192(%r8), %ymm0 +; AVX2-FAST-NEXT: vmovaps 192(%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 192(%r8), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5],ymm5[6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4,5],ymm15[6],ymm12[7] -; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 224(%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovaps 224(%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 224(%rdi), %xmm12 -; AVX2-FAST-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 224(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2],xmm15[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovaps 224(%r8), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4,5],ymm15[6],ymm12[7] -; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm2, %ymm11 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = [0,1,3,2,3,2,3,2] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm14, %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm10, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0],ymm11[1,2,3],ymm10[4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm13 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm0[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovaps 224(%r8), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm12 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5],ymm5[6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [0,1,3,2,3,2,3,2] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm15[1,2],ymm11[3,4],ymm15[5,6],ymm11[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm10[1,2,3,4],ymm11[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm15 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm11 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm5 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm13[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4],ymm10[5,6],ymm11[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm15[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm4, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm13[0],xmm7[1],xmm13[1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm0 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm5 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm8 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm10 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm9[1,2],ymm5[3,4],ymm9[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm4[1,2,3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm5, %ymm10 +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm7 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vinsertf128 $1, 64(%r8), %ymm3, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vinsertf128 $1, 64(%r8), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm5 +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm5 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm7 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3,4],ymm6[5,6],ymm4[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 80(%r8), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1,2,3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm7 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 80(%r8), %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4],ymm3[5,6],ymm4[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 88(%r8), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 88(%r8), %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 96(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm4 -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14 +; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm14, %ymm5 +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm6 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm14 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 112(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm3[2],ymm14[3],ymm3[3],ymm14[6],ymm3[6],ymm14[7],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -5630,35 +5614,35 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 128(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rdx), %ymm13 ; AVX2-FAST-NEXT: vmovaps 128(%rcx), %ymm1 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps %ymm13, %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm13 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 144(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -5671,32 +5655,34 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 160(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovaps 160(%rcx), %ymm12 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovaps 160(%rcx), %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm9 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm10 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1,2],ymm1[3,4],ymm11[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 176(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm12[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -5709,30 +5695,30 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 192(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm7 -; AVX2-FAST-NEXT: vmovaps 192(%rcx), %ymm6 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm8 +; AVX2-FAST-NEXT: vmovaps 192(%rcx), %ymm7 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm5 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 208(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -5742,11 +5728,11 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 224(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] @@ -5755,75 +5741,73 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 224(%rcx), %ymm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm1[1,2],ymm11[3,4],ymm1[5,6],ymm11[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 240(%r8), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1,2,3,4],ymm11[5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1,2],ymm11[3,4],ymm8[5,6],ymm11[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm11[3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 248(%r8), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm15[4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm11[2],ymm14[3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm11[1,2,3],ymm8[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm11[2],ymm14[3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm11[1,2,3],ymm8[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1,2],ymm9[3,4],ymm1[5,6],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 240(%r8), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1,2,3,4],ymm9[5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1,2],ymm9[3,4],ymm4[5,6],ymm9[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 248(%r8), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm15[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2],ymm12[3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3],ymm4[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2],ymm12[3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3],ymm4[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,3,0,4,5,7,4] ; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm11[1,2,3],ymm8[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1],ymm9[2],ymm15[3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3],ymm4[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm14[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] ; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm14[1,2,3],ymm8[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm14[1,2,3],ymm4[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm13[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] @@ -5835,23 +5819,24 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3],ymm14[4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps $78, (%rsp), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4,5,6],ymm10[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm10[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vpermilps $78, (%rsp), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm11[2],ymm14[3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1,2,3],ymm10[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] @@ -5861,13 +5846,12 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps %ymm0, 1184(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm4, 1024(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm9, 864(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm5, 1024(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm10, 864(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm13, 704(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm8, 544(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm11, 384(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm4, 544(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm9, 384(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm12, 224(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6149,18 +6133,18 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] @@ -6168,162 +6152,160 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 112(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm2[2],ymm14[3],ymm2[3],ymm14[6],ymm2[6],ymm14[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdx), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rcx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 144(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 152(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rcx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rcx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rsi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3,4],ymm9[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rsi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 176(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm12[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 184(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdx), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rcx), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %ymm4 @@ -6335,7 +6317,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6343,104 +6325,103 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 216(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rcx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2],ymm10[3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 240(%r8), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2,3,4],ymm15[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2],ymm15[3,4],ymm10[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%r8), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm15[2],ymm10[3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm1[1,2],ymm11[3,4],ymm1[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 240(%r8), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1,2,3,4],ymm11[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1,2],ymm11[3,4],ymm7[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%r8), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm14[1,2,3],ymm7[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm13[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3],ymm14[4,5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -6455,20 +6436,19 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1184(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 1024(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 864(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 864(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 704(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 544(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 384(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 544(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 384(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1248(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1216(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1088(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1056(%r9) @@ -6534,423 +6514,429 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-LABEL: store_i32_stride5_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512F-NEXT: subq $712, %rsp # imm = 0x2C8 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm25 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm16 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm27 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm22 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm23, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm29 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm21 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm25 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm24 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm28, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm15, %zmm28 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm13, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm18, %zmm21 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm26, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm22, %zmm23, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm17, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm13, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm18, %zmm24 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm26, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm27, %zmm23, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm13, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm23, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm20, %zmm3, %zmm2 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm14, %zmm31 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] +; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm13, %zmm22 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] +; AVX512F-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm20, %zmm18, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm26, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm23, %zmm20 -; AVX512F-NEXT: vpermi2d %zmm16, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm16, %zmm13 -; AVX512F-NEXT: vpermi2d %zmm16, %zmm0, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm16, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm11, %zmm29 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm15, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm4, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm23, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm16, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm11, %zmm19 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm8 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm16, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm23, %zmm9 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm12, %zmm15 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm12, %zmm4 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm12, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm16, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm29, %zmm14, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm13, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm30, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm24, %zmm28, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm14, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm28, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm25, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm25, %zmm13, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 +; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 +; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm30, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm17, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm28, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm2, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm7, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm17, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm5, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm28, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm25, %zmm17 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm25, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm25, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm2, %zmm1 ; AVX512F-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm8 {%k1} ; AVX512F-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm21 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm22 {%k2} ; AVX512F-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm26 {%k3} -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm16, %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm21 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} +; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm29 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm24 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm6 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm16, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm30 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm3 {%k3} -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm30 {%k3} +; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm15, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm25, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm19 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm4 {%k3} +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm7 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm12, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm16, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm20, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm25, %zmm3 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm15 {%k2} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm15 -; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm13 {%k1} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm16, %zmm13 -; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm18 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm2 {%k3} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm1, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm30, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm27, 512(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm5, 576(%r9) +; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm5, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm15, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm16, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm25, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm17 {%k2} +; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm17 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm14 {%k1} +; AVX512F-NEXT: vpermt2d %zmm2, %zmm15, %zmm14 +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm13 {%k2} +; AVX512F-NEXT: vpermt2d %zmm2, %zmm16, %zmm13 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm1 {%k3} +; AVX512F-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm13, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm14, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm0, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm4, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm26, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm24, 512(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm6, 576(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm19, 640(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, 704(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm24, 768(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm22, 832(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm17, 896(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm29, 960(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 1024(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm21, 1088(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm28, 1152(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm30, 704(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm27, 768(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm20, 832(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm18, 896(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 960(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm21, 1024(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 1088(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm31, 1152(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 1216(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512F-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512F-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i32_stride5_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512BW-NEXT: subq $712, %rsp # imm = 0x2C8 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm25 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm16 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm27 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm22 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm23, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm29 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm21 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm25 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm24 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm28, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm15, %zmm28 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm18, %zmm21 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm26, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm23, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm17, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm18, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm26, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm23, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm13, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm23, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm3, %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm14, %zmm31 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm22 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] +; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm18, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm26, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm23, %zmm20 -; AVX512BW-NEXT: vpermi2d %zmm16, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm16, %zmm13 -; AVX512BW-NEXT: vpermi2d %zmm16, %zmm0, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm16, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm11, %zmm29 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm15, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm23, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm16, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm11, %zmm19 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm16, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm23, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm12, %zmm15 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm12, %zmm4 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm12, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm14, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm13, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm30, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm28, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm28, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm13, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm30, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm17, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm17, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm28, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm25, %zmm17 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm25, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm25, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm1 ; AVX512BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm8 {%k1} ; AVX512BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm22 {%k2} ; AVX512BW-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm26 {%k3} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm16, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm29 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm16, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm30 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm3 {%k3} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm30 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm4 {%k3} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm16, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm20, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm25, %zmm3 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm15 {%k2} -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm15 -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm13 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm16, %zmm13 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm2 {%k3} -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 576(%r9) +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm16, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm25, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm17 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm14 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm14 +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm13 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm13 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k3} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 512(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 576(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 640(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 704(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 768(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 832(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 896(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 960(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1024(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 1088(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 1152(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 704(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 768(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 832(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 896(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 960(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 1024(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 1088(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 1152(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1216(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512BW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512BW-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll index 4aee13aae36f9..96543b486b35d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -180,36 +180,36 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm2 +; SSE-NEXT: movaps (%rsi), %xmm4 ; SSE-NEXT: movaps (%rdx), %xmm1 -; SSE-NEXT: movaps (%rcx), %xmm3 -; SSE-NEXT: movaps (%r8), %xmm4 -; SSE-NEXT: movaps (%r9), %xmm5 -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm5[3,3] -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm5[1,1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[2,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movaps (%rcx), %xmm5 +; SSE-NEXT: movaps (%r8), %xmm7 +; SSE-NEXT: movaps (%r9), %xmm3 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm3[3,3] +; SSE-NEXT: movaps %xmm7, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm3[1,1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] +; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm7[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[2,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm8[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm9[0,2] -; SSE-NEXT: movaps %xmm5, 16(%rax) -; SSE-NEXT: movaps %xmm6, 32(%rax) +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm9[0,2] +; SSE-NEXT: movaps %xmm3, 16(%rax) +; SSE-NEXT: movaps %xmm2, 32(%rax) ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps %xmm1, 80(%rax) -; SSE-NEXT: movaps %xmm7, 64(%rax) -; SSE-NEXT: movaps %xmm4, (%rax) +; SSE-NEXT: movaps %xmm6, 64(%rax) +; SSE-NEXT: movaps %xmm7, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf4: @@ -335,73 +335,76 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride6_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps (%rdi), %xmm4 ; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps (%rsi), %xmm7 +; SSE-NEXT: movaps (%rsi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 16(%rsi), %xmm10 -; SSE-NEXT: movaps (%rdx), %xmm6 +; SSE-NEXT: movaps (%rdx), %xmm8 ; SSE-NEXT: movaps 16(%rdx), %xmm2 -; SSE-NEXT: movaps (%rcx), %xmm5 +; SSE-NEXT: movaps (%rcx), %xmm6 ; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps (%r8), %xmm3 +; SSE-NEXT: movaps (%r8), %xmm5 ; SSE-NEXT: movaps 16(%r8), %xmm11 -; SSE-NEXT: movaps (%r9), %xmm8 -; SSE-NEXT: movaps 16(%r9), %xmm4 -; SSE-NEXT: movaps %xmm9, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; SSE-NEXT: movaps (%r9), %xmm7 +; SSE-NEXT: movaps 16(%r9), %xmm3 +; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm12 +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE-NEXT: movaps %xmm11, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm14[2,0] ; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[2,0] -; SSE-NEXT: movaps %xmm11, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1] ; SSE-NEXT: movaps %xmm2, %xmm15 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movaps %xmm11, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm4[3,3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm11[0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,3] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm11[0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm12[0,2] -; SSE-NEXT: movaps %xmm3, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm8[3,3] -; SSE-NEXT: movaps %xmm6, %xmm12 -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm11[0,2] -; SSE-NEXT: movaps %xmm0, %xmm11 -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movaps %xmm15, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm14[0,2] +; SSE-NEXT: movaps %xmm5, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm7[3,3] +; SSE-NEXT: movaps %xmm8, %xmm11 +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm14[0,2] +; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm10[0,2] -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm11[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[0,2] +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm14[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,0] +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm5[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm7, 32(%rax) -; SSE-NEXT: movaps %xmm11, 48(%rax) +; SSE-NEXT: movaps %xmm0, 32(%rax) +; SSE-NEXT: movaps %xmm14, 48(%rax) ; SSE-NEXT: movaps %xmm1, 96(%rax) -; SSE-NEXT: movaps %xmm4, 112(%rax) -; SSE-NEXT: movaps %xmm14, 160(%rax) +; SSE-NEXT: movaps %xmm3, 112(%rax) +; SSE-NEXT: movaps %xmm13, 160(%rax) ; SSE-NEXT: movaps %xmm2, 176(%rax) -; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: movaps %xmm5, 16(%rax) -; SSE-NEXT: movaps %xmm6, 64(%rax) -; SSE-NEXT: movaps %xmm12, 80(%rax) -; SSE-NEXT: movaps %xmm15, 128(%rax) -; SSE-NEXT: movaps %xmm13, 144(%rax) +; SSE-NEXT: movaps %xmm4, (%rax) +; SSE-NEXT: movaps %xmm6, 16(%rax) +; SSE-NEXT: movaps %xmm8, 64(%rax) +; SSE-NEXT: movaps %xmm11, 80(%rax) +; SSE-NEXT: movaps %xmm10, 128(%rax) +; SSE-NEXT: movaps %xmm12, 144(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf8: @@ -771,139 +774,139 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i32_stride6_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $72, %rsp -; SSE-NEXT: movaps (%rdi), %xmm4 -; SSE-NEXT: movaps 16(%rdi), %xmm5 -; SSE-NEXT: movaps (%rsi), %xmm8 -; SSE-NEXT: movaps 16(%rsi), %xmm11 -; SSE-NEXT: movaps (%rdx), %xmm6 -; SSE-NEXT: movaps 16(%rdx), %xmm7 +; SSE-NEXT: movaps (%rdi), %xmm7 +; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps (%rsi), %xmm2 +; SSE-NEXT: movaps 16(%rsi), %xmm6 +; SSE-NEXT: movaps (%rdx), %xmm9 +; SSE-NEXT: movaps 16(%rdx), %xmm10 ; SSE-NEXT: movaps (%rcx), %xmm1 -; SSE-NEXT: movaps 16(%rcx), %xmm14 -; SSE-NEXT: movaps (%r8), %xmm9 -; SSE-NEXT: movaps 16(%r8), %xmm15 -; SSE-NEXT: movaps (%r9), %xmm2 -; SSE-NEXT: movaps 16(%r9), %xmm0 -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] -; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm3[2,3] +; SSE-NEXT: movaps 16(%rcx), %xmm0 +; SSE-NEXT: movaps (%r8), %xmm3 +; SSE-NEXT: movaps 16(%r8), %xmm14 +; SSE-NEXT: movaps (%r9), %xmm4 +; SSE-NEXT: movaps 16(%r9), %xmm13 +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movaps %xmm4, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm3[0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm5[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm3[0,2] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm4[2,3] -; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm2[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm9[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm15[0] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm11[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[0,2] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[2,3] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm3[0,2] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movaps %xmm13, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm14[0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm13[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[2,3] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm13[3,3] ; SSE-NEXT: movaps 32(%rdx), %xmm13 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm15[0,2] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm14[0,2] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; SSE-NEXT: movaps 32(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm12, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movaps %xmm12, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] ; SSE-NEXT: movaps 32(%r8), %xmm2 -; SSE-NEXT: movaps 32(%r9), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm11 +; SSE-NEXT: movaps 32(%r9), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm14[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm15[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2] ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] ; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm12[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm2[0,2] -; SSE-NEXT: movaps 48(%rdx), %xmm2 -; SSE-NEXT: movaps 48(%rcx), %xmm9 +; SSE-NEXT: movaps 48(%rdx), %xmm3 +; SSE-NEXT: movaps 48(%rcx), %xmm10 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] +; SSE-NEXT: movaps 48(%rdi), %xmm2 +; SSE-NEXT: movaps 48(%rsi), %xmm9 ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps 48(%rsi), %xmm10 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] -; SSE-NEXT: movaps 48(%r8), %xmm3 +; SSE-NEXT: movaps 48(%r8), %xmm1 ; SSE-NEXT: movaps 48(%r9), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm1[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm7[3,3] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm3[0,2] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 368(%rax) -; SSE-NEXT: movaps %xmm10, 352(%rax) -; SSE-NEXT: movaps %xmm0, 336(%rax) -; SSE-NEXT: movaps %xmm4, 320(%rax) +; SSE-NEXT: movaps %xmm3, 368(%rax) +; SSE-NEXT: movaps %xmm9, 352(%rax) +; SSE-NEXT: movaps %xmm2, 336(%rax) +; SSE-NEXT: movaps %xmm5, 320(%rax) ; SSE-NEXT: movaps %xmm6, 304(%rax) -; SSE-NEXT: movaps %xmm5, 288(%rax) +; SSE-NEXT: movaps %xmm4, 288(%rax) ; SSE-NEXT: movaps %xmm13, 272(%rax) ; SSE-NEXT: movaps %xmm8, 256(%rax) ; SSE-NEXT: movaps %xmm12, 240(%rax) -; SSE-NEXT: movaps %xmm15, 224(%rax) +; SSE-NEXT: movaps %xmm14, 224(%rax) ; SSE-NEXT: movaps %xmm11, 208(%rax) -; SSE-NEXT: movaps %xmm14, 192(%rax) +; SSE-NEXT: movaps %xmm15, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -934,7 +937,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-LABEL: store_i32_stride6_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $104, %rsp -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm13 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm9 @@ -953,15 +956,15 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm13[0],ymm4[1],ymm13[1],ymm4[4],ymm13[4],ymm4[5],ymm13[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[1],ymm13[1],ymm5[4],ymm13[4],ymm5[5],ymm13[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] @@ -999,34 +1002,34 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm10[0,1,2,3,4],ymm14[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm13[2],ymm4[3],ymm13[3],ymm4[6],ymm13[6],ymm4[7],ymm13[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,2],ymm9[1,2],ymm7[5,6],ymm9[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[6],ymm13[6],ymm5[7],ymm13[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,2],ymm9[1,2],ymm7[5,6],ymm9[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm10[3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,2],ymm12[1,2],ymm11[5,6],ymm12[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm6 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] @@ -1073,21 +1076,21 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm12[3,0],ymm11[3,0],ymm12[7,4],ymm11[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 256(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rax) @@ -1104,17 +1107,17 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-LABEL: store_i32_stride6_vf16: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $200, %rsp -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1125,7 +1128,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm6 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 4(%r9), %ymm4 @@ -1134,12 +1137,12 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,2,2,3] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 36(%r9), %ymm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] @@ -1147,11 +1150,11 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpbroadcastd 32(%rcx), %xmm3 ; AVX2-SLOW-NEXT: vpbroadcastd 32(%rdx), %xmm4 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm7, %ymm2 +; AVX2-SLOW-NEXT: vpbroadcastq %xmm6, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm15 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm15, %ymm2 @@ -1159,12 +1162,12 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm14 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] @@ -1174,7 +1177,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm1 ; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm3 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm13, %ymm1 @@ -1188,56 +1191,56 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,2,4,5,6,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm3[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm5 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm12 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm12 +; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[6],ymm14[6],ymm10[7],ymm14[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm12[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm13 = xmm12[2],mem[2],xmm12[3],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm11, %ymm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm13[2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[4],ymm14[4],ymm10[5],ymm14[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[4],ymm14[4],ymm9[5],ymm14[5] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 48(%r9), %ymm9 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] @@ -1251,7 +1254,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] @@ -1262,7 +1265,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, 288(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 256(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 256(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm4, 352(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1284,15 +1287,15 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-LABEL: store_i32_stride6_vf16: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $232, %rsp -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm12 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm15 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 @@ -1311,7 +1314,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] @@ -1326,7 +1329,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm12 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm11 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm2 @@ -1334,14 +1337,14 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,2,2,4,5,6,6] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm15 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[6],ymm15[6],ymm2[7],ymm15[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm14 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[6],ymm14[6],ymm2[7],ymm14[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] @@ -1351,7 +1354,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm1 ; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm3 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 @@ -1364,34 +1367,34 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,2,4,5,6,6] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm14 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm3[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm13 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm14[2],ymm4[2],ymm14[3],ymm4[3],ymm14[6],ymm4[6],ymm14[7],ymm4[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm3[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm9, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm12, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm15[1],ymm7[2,3,4,5,6],ymm15[7] ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm15[2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm6, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,6,2,3,4,6,6,7] @@ -1404,17 +1407,17 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm12, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm7 = ymm7[0],ymm15[0],ymm7[1],ymm15[1],ymm7[4],ymm15[4],ymm7[5],ymm15[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm7 = ymm7[0],ymm14[0],ymm7[1],ymm14[1],ymm7[4],ymm14[4],ymm7[5],ymm14[5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm7 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7] @@ -1423,10 +1426,10 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[4],ymm1[4],ymm5[5],ymm1[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[1],ymm4[1],ymm14[4],ymm4[4],ymm14[5],ymm4[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[4],ymm4[4],ymm13[5],ymm4[5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] @@ -1461,17 +1464,17 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-LABEL: store_i32_stride6_vf16: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1482,7 +1485,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%r9), %ymm4 @@ -1491,12 +1494,12 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%r9), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] @@ -1504,11 +1507,11 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rcx), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rdx), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm6, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm15, %ymm2 @@ -1516,12 +1519,12 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] @@ -1531,7 +1534,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm13, %ymm1 @@ -1545,56 +1548,56 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm3[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[6],ymm14[6],ymm10[7],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm12[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm12[2],mem[2],xmm12[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm11, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm13[2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[4],ymm14[4],ymm10[5],ymm14[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[4],ymm14[4],ymm9[5],ymm14[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 48(%r9), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] @@ -1608,7 +1611,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] @@ -1619,7 +1622,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 288(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 256(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 256(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2097,13 +2100,13 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdx), %xmm7 +; SSE-NEXT: movaps 48(%rdx), %xmm6 ; SSE-NEXT: movaps 48(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 48(%rdi), %xmm6 +; SSE-NEXT: movaps 48(%rdi), %xmm7 ; SSE-NEXT: movaps 48(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 48(%r8), %xmm2 ; SSE-NEXT: movaps 48(%r9), %xmm3 @@ -2117,26 +2120,26 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdx), %xmm6 ; SSE-NEXT: movaps 64(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 64(%rdi), %xmm6 +; SSE-NEXT: movaps 64(%rdi), %xmm7 ; SSE-NEXT: movaps 64(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 64(%r8), %xmm2 ; SSE-NEXT: movaps 64(%r9), %xmm3 @@ -2150,26 +2153,26 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdx), %xmm6 ; SSE-NEXT: movaps 80(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 80(%rdi), %xmm6 +; SSE-NEXT: movaps 80(%rdi), %xmm7 ; SSE-NEXT: movaps 80(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 80(%r8), %xmm2 ; SSE-NEXT: movaps 80(%r9), %xmm3 @@ -2183,84 +2186,84 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdx), %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdx), %xmm9 ; SSE-NEXT: movaps 96(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: movaps 96(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movaps 96(%rdi), %xmm11 ; SSE-NEXT: movaps 96(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: movaps %xmm11, %xmm13 ; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] ; SSE-NEXT: movaps 96(%r8), %xmm2 -; SSE-NEXT: movaps 96(%r9), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm15 +; SSE-NEXT: movaps 96(%r9), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm13[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm5[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] -; SSE-NEXT: movaps %xmm2, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm8[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] -; SSE-NEXT: movaps 112(%rdx), %xmm2 -; SSE-NEXT: movaps 112(%rcx), %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm11[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2] +; SSE-NEXT: movaps 112(%rdx), %xmm3 +; SSE-NEXT: movaps 112(%rcx), %xmm12 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] +; SSE-NEXT: movaps 112(%rdi), %xmm2 +; SSE-NEXT: movaps 112(%rsi), %xmm10 ; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSE-NEXT: movaps 112(%rdi), %xmm0 -; SSE-NEXT: movaps 112(%rsi), %xmm14 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] -; SSE-NEXT: movaps 112(%r8), %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] +; SSE-NEXT: movaps 112(%r8), %xmm1 ; SSE-NEXT: movaps 112(%r9), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm1[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm3, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm7[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm3[0,2] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 752(%rax) -; SSE-NEXT: movaps %xmm14, 736(%rax) -; SSE-NEXT: movaps %xmm0, 720(%rax) -; SSE-NEXT: movaps %xmm4, 704(%rax) +; SSE-NEXT: movaps %xmm3, 752(%rax) +; SSE-NEXT: movaps %xmm10, 736(%rax) +; SSE-NEXT: movaps %xmm2, 720(%rax) +; SSE-NEXT: movaps %xmm5, 704(%rax) ; SSE-NEXT: movaps %xmm6, 688(%rax) -; SSE-NEXT: movaps %xmm5, 672(%rax) -; SSE-NEXT: movaps %xmm11, 656(%rax) -; SSE-NEXT: movaps %xmm9, 640(%rax) -; SSE-NEXT: movaps %xmm10, 624(%rax) -; SSE-NEXT: movaps %xmm12, 608(%rax) +; SSE-NEXT: movaps %xmm4, 672(%rax) +; SSE-NEXT: movaps %xmm9, 656(%rax) +; SSE-NEXT: movaps %xmm8, 640(%rax) +; SSE-NEXT: movaps %xmm11, 624(%rax) +; SSE-NEXT: movaps %xmm14, 608(%rax) ; SSE-NEXT: movaps %xmm15, 592(%rax) ; SSE-NEXT: movaps %xmm13, 576(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2340,13 +2343,14 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1016, %rsp # imm = 0x3F8 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX1-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm10 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 @@ -2373,10 +2377,9 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm8[0],ymm12[1],ymm8[1],ymm12[4],ymm8[4],ymm12[5],ymm8[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm4[0],ymm10[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -2388,9 +2391,9 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -2400,15 +2403,16 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 36(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[4],ymm7[4],ymm8[5],ymm7[5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -2418,11 +2422,11 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 48(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 @@ -2438,16 +2442,15 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 68(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm14 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm14[0],ymm6[1],ymm14[1],ymm6[4],ymm14[4],ymm6[5],ymm14[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm6[0],ymm14[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -2457,16 +2460,16 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 80(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm15[1,2] -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,2],xmm9[1,2] +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] @@ -2475,130 +2478,95 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 100(%r9), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm11 -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm11[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[4],ymm3[4],ymm7[5],ymm3[5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm15[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 112(%r9), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 112(%r9), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm12[1,2],ymm0[5,6],ymm12[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,2],ymm10[1,2],ymm0[5,6],ymm10[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,2],ymm9[1,2],ymm10[5,6],ymm9[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[1,2],ymm13[1,2],ymm8[5,6],ymm13[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm14[2],ymm6[3],ymm14[3],ymm6[6],ymm14[6],ymm6[7],ymm14[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,2],ymm14[1,2],ymm6[5,6],ymm14[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm14[1,2],mem[1,2],ymm14[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 84(%r9), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[0,0,0,0] -; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,2],ymm13[1,2],ymm12[5,6],ymm13[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 116(%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[0,0,0,0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[6],ymm3[6],ymm7[7],ymm3[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,2],ymm2[1,2],ymm10[5,6],ymm2[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 116(%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] @@ -2608,97 +2576,135 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,0],ymm10[3,0],ymm9[7,4],ymm10[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 64(%r9), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm12[3,0],mem[3,0],ymm12[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm7[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm13[3,0],mem[3,0],ymm13[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm15[2,3],ymm6[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm13[3,0],ymm12[3,0],ymm13[7,4],ymm12[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4,5,6],ymm8[7] +; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 64(%r9), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3,4,5,6],ymm12[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm5[3,0],ymm14[3,0],ymm5[7,4],ymm14[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4,5,6],ymm12[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm10[2],mem[2],xmm10[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm12 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],ymm10[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4,5,6],ymm10[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm4, 736(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 640(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 448(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 384(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 256(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 640(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 576(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2721,7 +2727,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $1016, %rsp # imm = 0x3F8 +; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2768,7 +2774,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero -; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm12 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 36(%r9), %ymm5 @@ -2786,9 +2792,10 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm10 -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm10[0],zero,xmm10[1],zero -; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm8 +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm8[0],zero,xmm8[1],zero +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 68(%r9), %ymm7 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] @@ -2801,19 +2808,19 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,3] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm14 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 100(%r9), %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm8 ; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm9 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] @@ -2827,174 +2834,175 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm14 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,1,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm8 ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd 32(%rcx), %xmm11 -; AVX2-SLOW-NEXT: vpbroadcastd 32(%rdx), %xmm12 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd 32(%rcx), %xmm10 +; AVX2-SLOW-NEXT: vpbroadcastd 32(%rdx), %xmm11 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm13, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[6],ymm11[6],ymm12[7],ymm11[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 52(%r9), %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd 64(%rcx), %xmm13 -; AVX2-SLOW-NEXT: vpbroadcastd 64(%rdx), %xmm15 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 52(%r9), %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd 64(%rcx), %xmm12 +; AVX2-SLOW-NEXT: vpbroadcastd 64(%rdx), %xmm13 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm10, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm15, %ymm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[1,1,2,3,5,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2],ymm13[3],ymm15[4],ymm13[5],ymm15[6],ymm13[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm10 = ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[6],ymm10[6],ymm0[7],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm12 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 84(%r9), %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm7 ; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastd %xmm7, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm6[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,2,3,5,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2],ymm13[3],ymm15[4],ymm13[5],ymm15[6],ymm13[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm10 = ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[6],ymm0[6],ymm10[7],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 116(%r9), %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = xmm10[2],mem[2],xmm10[3],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm15 = mem[2,2,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm10[1],ymm13[2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm15 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm14[0],ymm1[0],ymm14[1],ymm1[1],ymm14[4],ymm1[4],ymm14[5],ymm1[5] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm9 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm14[2],ymm1[2],ymm14[3],ymm1[3],ymm14[6],ymm1[6],ymm14[7],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[4],ymm11[4],ymm12[5],ymm11[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 48(%r9), %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 48(%r9), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[2,3],ymm2[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = mem[0,2,2,3,4,6,6,7] @@ -3004,32 +3012,30 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermilps $250, (%rsp), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm9[1],ymm3[2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[0],ymm9[1],mem[1],ymm9[4],mem[4],ymm9[5],mem[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 80(%r9), %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpermilps $250, (%rsp), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 80(%r9), %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[2,3],ymm4[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = mem[0,2,2,3,4,6,6,7] @@ -3039,50 +3045,51 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 112(%r9), %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[2,3],ymm6[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm11 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm11 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[4],ymm7[4],ymm0[5],ymm7[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 112(%r9), %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = mem[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 736(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, 672(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 736(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 672(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm5, 640(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm4, 544(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 480(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 480(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm3, 448(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 352(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 288(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 288(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm1, 256(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm13, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm15, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 704(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3113,340 +3120,337 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i32_stride6_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $888, %rsp # imm = 0x378 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: subq $872, %rsp # imm = 0x368 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm8 -; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 ; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm2[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[1,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[1,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm12 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm7 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm14[2],xmm11[2],xmm14[3],xmm11[3] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm8[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm4[0],zero,xmm4[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm8 -; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm9 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2],ymm0[3],ymm8[4],ymm0[5],ymm8[6],ymm0[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[6],ymm1[6],ymm6[7],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm8 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm5, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm13 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm13[0],zero,xmm13[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd 64(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm7, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 64(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm14 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm14[0],zero,xmm14[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm15 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm11 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm10 +; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm12 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm8, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm10[2],ymm2[3],ymm10[3],ymm2[6],ymm10[6],ymm2[7],ymm10[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm12[3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm8 +; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm12 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm13, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd 64(%rcx), %xmm1 +; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm4 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm14, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 64(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm4 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm1[1],ymm9[2],ymm1[3],ymm9[4],ymm1[5],ymm9[6],ymm1[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm12 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 84(%r9), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm15, %xmm1 +; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm11, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 84(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd %xmm13, %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd %xmm10, %xmm3 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 96(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 96(%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm9[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2],ymm0[3],ymm11[4],ymm0[5],ymm11[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2],ymm0[3],ymm9[4],ymm0[5],ymm9[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[6],ymm1[6],ymm10[7],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm9 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 116(%r9), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 116(%r9), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm13 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm11[1],ymm14[2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[2,3],ymm2[2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,6,2,3,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6],ymm10[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,3,2,3] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[2,3],ymm5[2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,6,2,3,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm11, %ymm5 ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm11 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3,4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm14 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[0],ymm13[1],mem[1],ymm13[4],mem[4],ymm13[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[4],ymm5[4],ymm8[5],ymm5[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm11[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[4],mem[4],ymm5[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3,4],ymm13[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[2,3],ymm5[2,3] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm7, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm11 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm11 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1],ymm8[2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[4],ymm12[4],ymm15[5],ymm12[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[4],ymm12[4],ymm14[5],ymm12[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[4],ymm4[4],ymm6[5],ymm4[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm15[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[2,3],ymm6[2,3] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm11, %ymm7 -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[4],mem[4],ymm7[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[4],ymm3[4],ymm9[5],ymm3[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm15[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[2,3],ymm3[2,3] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[2,3],ymm4[2,3] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm11 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[2,3],ymm2[2,3] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm2, 736(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 672(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 672(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 640(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 544(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 544(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm12, 480(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm8, 448(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm5, 352(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm13, 288(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 256(%rax) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 256(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 160(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) @@ -3476,7 +3480,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $888, %rsp # imm = 0x378 +; AVX2-FAST-NEXT: addq $872, %rsp # imm = 0x368 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -3523,7 +3527,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%r9), %ymm5 @@ -3541,9 +3545,10 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm10[0],zero,xmm10[1],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm8[0],zero,xmm8[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 68(%r9), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] @@ -3556,19 +3561,19 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%r9), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] @@ -3582,174 +3587,175 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rcx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rdx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rcx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rdx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm13, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm12, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[6],ymm11[6],ymm12[7],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 52(%r9), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 64(%rcx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 64(%rdx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 52(%r9), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 64(%rcx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 64(%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[1,1,2,3,5,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2],ymm13[3],ymm15[4],ymm13[5],ymm15[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm10 = ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[6],ymm10[6],ymm0[7],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 84(%r9), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm6[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,2,3,5,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2],ymm13[3],ymm15[4],ymm13[5],ymm15[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm10 = ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[6],ymm0[6],ymm10[7],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 116(%r9), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm10[2],mem[2],xmm10[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm10[1],ymm13[2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm14[0],ymm1[0],ymm14[1],ymm1[1],ymm14[4],ymm1[4],ymm14[5],ymm1[5] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm14[2],ymm1[2],ymm14[3],ymm1[3],ymm14[6],ymm1[6],ymm14[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[4],ymm11[4],ymm12[5],ymm11[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 48(%r9), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 48(%r9), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[2,3],ymm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = mem[0,2,2,3,4,6,6,7] @@ -3759,32 +3765,30 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, (%rsp), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm9[1],ymm3[2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[0],ymm9[1],mem[1],ymm9[4],mem[4],ymm9[5],mem[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 80(%r9), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, (%rsp), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 80(%r9), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[2,3],ymm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = mem[0,2,2,3,4,6,6,7] @@ -3794,50 +3798,51 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 112(%r9), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[2,3],ymm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[4],ymm7[4],ymm0[5],ymm7[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 112(%r9), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = mem[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 736(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 672(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 736(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 672(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 640(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 544(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 480(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 480(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 448(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 352(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 288(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 704(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3869,135 +3874,135 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-LABEL: store_i32_stride6_vf32: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rsi), %zmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rsi), %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %zmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%r8), %zmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%r9), %zmm10 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512F-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512F-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm13, %zmm11 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm12, %zmm11 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm14, %zmm13 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512F-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm16, %zmm15 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, %zmm17 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm18, %zmm17 -; AVX512F-SLOW-NEXT: vpermi2d %zmm14, %zmm12, %zmm13 -; AVX512F-SLOW-NEXT: vpermi2d %zmm14, %zmm12, %zmm16 -; AVX512F-SLOW-NEXT: vpermi2d %zmm14, %zmm12, %zmm18 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm19 = zmm12[2],zmm14[2],zmm12[3],zmm14[3],zmm12[6],zmm14[6],zmm12[7],zmm14[7],zmm12[10],zmm14[10],zmm12[11],zmm14[11],zmm12[14],zmm14[14],zmm12[15],zmm14[15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm14, %zmm3, %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm16, %zmm15 +; AVX512F-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm12 +; AVX512F-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm14 +; AVX512F-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm16 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm19 = zmm17[2],zmm18[2],zmm17[3],zmm18[3],zmm17[6],zmm18[6],zmm17[7],zmm18[7],zmm17[10],zmm18[10],zmm17[11],zmm18[11],zmm17[14],zmm18[14],zmm17[15],zmm18[15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm2, %zmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 ; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm20 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] -; AVX512F-SLOW-NEXT: vpermt2d (%rcx), %ymm21, %ymm14 +; AVX512F-SLOW-NEXT: vpermt2d (%rcx), %ymm21, %ymm18 ; AVX512F-SLOW-NEXT: movb $36, %dl ; AVX512F-SLOW-NEXT: kmovw %edx, %k1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm14[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm14, %zmm12 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm22, %zmm12 -; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm22, %zmm17 +; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 ; AVX512F-SLOW-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm20[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm14, %zmm3 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm22, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm18, %zmm2 +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm22, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm14, %zmm20 +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm20, %zmm11 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm11 +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512F-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm23 +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512F-SLOW-NEXT: movb $-110, %cl ; AVX512F-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm15 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2} ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm23, %zmm15 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm24, %zmm15 +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512F-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm25, %zmm26 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm26, %zmm17 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm26, %zmm15 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm27, %zmm17 -; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm20, %zmm13 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm21, %zmm13 -; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm22 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm23, %zmm16 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm24, %zmm16 -; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm26, %zmm18 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm27, %zmm18 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm27, %zmm15 +; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm20, %zmm12 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm21, %zmm12 +; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm22 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, %zmm14 {%k2} +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm23, %zmm14 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm24, %zmm14 +; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2} +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm26, %zmm16 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm27, %zmm16 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512F-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm14, %zmm20 +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm21 ; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm22, %zmm20 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm20 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm23, %zmm20 -; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm14 +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm23, %zmm20 +; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm21[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm22, %zmm14 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm23, %zmm14 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm22, %zmm18 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512F-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm21, %zmm0 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[14],zmm4[14],zmm2[15],zmm4[15] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm2[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm4, %zmm0 -; AVX512F-SLOW-NEXT: vpermt2d %zmm5, %zmm21, %zmm1 +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm7, %zmm0 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm21, %zmm1 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm4, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm5, %zmm1 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm7, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 576(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 448(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -4005,138 +4010,138 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm18 ; AVX512F-FAST-NEXT: vmovdqa64 64(%rcx), %zmm24 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512F-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm10 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512F-FAST-NEXT: vpermt2d %zmm18, %zmm26, %zmm14 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm14, %zmm2 +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm2 ; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512F-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm12, %zmm15 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm6, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm12, %zmm20 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512F-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm17, %zmm19 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512F-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm13, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm8, %zmm6 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512F-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 ; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm21, %zmm23 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512F-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm18, %zmm16 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm22, %zmm20 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm26 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm10, %zmm9 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512F-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm25 ; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512F-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm6 -; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm17 -; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm21 -; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm22 -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm24, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm0 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512F-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm25 -; AVX512F-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm12 -; AVX512F-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm13 -; AVX512F-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm18 -; AVX512F-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm24 -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm14, %zmm1 +; AVX512F-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm5 +; AVX512F-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm17 +; AVX512F-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm21 +; AVX512F-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm16 +; AVX512F-FAST-NEXT: vpermt2d %zmm18, %zmm26, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512F-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm12 +; AVX512F-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm8 +; AVX512F-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm10 +; AVX512F-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm24 +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm1 ; AVX512F-FAST-NEXT: movb $-110, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm11 ; AVX512F-FAST-NEXT: movb $36, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm10, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm16 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm20 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm9, %zmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm25 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm15, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm4, %zmm20 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm10, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm10, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm14, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm9, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm18 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm15, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm12, %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm4, %zmm22 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm4, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm11 -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm10, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm7, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm10, %zmm20 -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm12, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm12, %zmm25 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm13 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm7, %zmm18 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm10, %zmm22 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm12, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 64(%r8), %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm14, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm20, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm20, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm10 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm22, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 64(%r9), %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm2 +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm7, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm4, %zmm6 +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm14, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm11, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm17, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm7, %zmm5 +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm11, %zmm10 +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm14, %zmm16 +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm1 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 704(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 640(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -4144,135 +4149,135 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-SLOW-LABEL: store_i32_stride6_vf32: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm14 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm1 ; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm9 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm10 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm13, %zmm11 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512BW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm12, %zmm11 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512BW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm14, %zmm13 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512BW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm16, %zmm15 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm17 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm18, %zmm17 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm14, %zmm12, %zmm13 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm14, %zmm12, %zmm16 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm14, %zmm12, %zmm18 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm19 = zmm12[2],zmm14[2],zmm12[3],zmm14[3],zmm12[6],zmm14[6],zmm12[7],zmm14[7],zmm12[10],zmm14[10],zmm12[11],zmm14[11],zmm12[14],zmm14[14],zmm12[15],zmm14[15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm14, %zmm3, %zmm12 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm16, %zmm15 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm12 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm14 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm16 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm19 = zmm17[2],zmm18[2],zmm17[3],zmm18[3],zmm17[6],zmm18[6],zmm17[7],zmm18[7],zmm17[10],zmm18[10],zmm17[11],zmm18[11],zmm17[14],zmm18[14],zmm17[15],zmm18[15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm2, %zmm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 ; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm20 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] -; AVX512BW-SLOW-NEXT: vpermt2d (%rcx), %ymm21, %ymm14 +; AVX512BW-SLOW-NEXT: vpermt2d (%rcx), %ymm21, %ymm18 ; AVX512BW-SLOW-NEXT: movb $36, %dl ; AVX512BW-SLOW-NEXT: kmovd %edx, %k1 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm14[0,1,0,1,2,3,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm14, %zmm12 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm22, %zmm12 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm22, %zmm17 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 ; AVX512BW-SLOW-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm20[0,1,0,1,2,3,6,7] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm14, %zmm3 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm22, %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm18, %zmm2 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm22, %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm14, %zmm20 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm20, %zmm11 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm11 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512BW-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm23 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512BW-SLOW-NEXT: movb $-110, %cl ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm15 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2} ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm23, %zmm15 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm24, %zmm15 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512BW-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm25, %zmm26 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm26, %zmm17 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm26, %zmm15 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm27, %zmm17 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm14 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm20, %zmm13 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm21, %zmm13 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm22 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm23, %zmm16 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm24, %zmm16 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm26, %zmm18 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm27, %zmm18 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm27, %zmm15 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm20, %zmm12 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm21, %zmm12 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm22 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm14 {%k2} +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm23, %zmm14 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm24, %zmm14 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm25 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2} +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm26, %zmm16 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm27, %zmm16 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512BW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm14, %zmm20 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm21 ; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm22, %zmm20 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm20 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm23, %zmm20 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm14 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm23, %zmm20 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm21[2,3,2,3,2,3,2,3] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm22, %zmm14 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm23, %zmm14 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm22, %zmm18 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512BW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm21, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[14],zmm4[14],zmm2[15],zmm4[15] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm2[6,7,6,7,6,7,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm4, %zmm0 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm5, %zmm21, %zmm1 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm7, %zmm0 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm21, %zmm1 ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm2, %zmm1 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm4, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm5, %zmm1 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm7, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, 576(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, 448(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -4280,138 +4285,138 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm18 ; AVX512BW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm24 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm10 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2d %zmm18, %zmm26, %zmm14 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm14, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm2 ; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512BW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm12, %zmm15 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm6, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm12, %zmm20 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512BW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm17, %zmm19 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm13, %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512BW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm8, %zmm6 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512BW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 ; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm21, %zmm23 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512BW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm18, %zmm16 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm22, %zmm20 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm26 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512BW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm10, %zmm9 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512BW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm25 ; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512BW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm6 -; AVX512BW-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm17 -; AVX512BW-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm21 -; AVX512BW-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm22 -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm24, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm0 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm25 -; AVX512BW-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm12 -; AVX512BW-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm13 -; AVX512BW-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm18 -; AVX512BW-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm24 -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm14, %zmm1 +; AVX512BW-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm5 +; AVX512BW-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm17 +; AVX512BW-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm21 +; AVX512BW-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm16 +; AVX512BW-FAST-NEXT: vpermt2d %zmm18, %zmm26, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512BW-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm12 +; AVX512BW-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm8 +; AVX512BW-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm10 +; AVX512BW-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm24 +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm1 ; AVX512BW-FAST-NEXT: movb $-110, %al ; AVX512BW-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm11 ; AVX512BW-FAST-NEXT: movb $36, %al ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm10, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm16 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm20 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm9, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, %zmm25 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm15, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm4, %zmm20 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm10, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm10, %zmm25 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm14, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 {%k1} -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm9, %zmm13 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm18 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm15, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm12, %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm4, %zmm22 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm4, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm11 -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm10, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm7, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm10, %zmm20 -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm12, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm12, %zmm25 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm4, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm13 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm7, %zmm18 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm10, %zmm22 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm12, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 64(%r8), %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm14, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm20, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm20, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm10 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm22, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%r9), %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm7, %zmm16 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm4, %zmm6 +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm14, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm11, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm17, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm7, %zmm5 +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm11, %zmm10 +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm14, %zmm16 +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm1 ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, 256(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 704(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 448(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 640(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq @@ -4535,13 +4540,13 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdx), %xmm7 +; SSE-NEXT: movaps 48(%rdx), %xmm6 ; SSE-NEXT: movaps 48(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 48(%rdi), %xmm6 +; SSE-NEXT: movaps 48(%rdi), %xmm7 ; SSE-NEXT: movaps 48(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 48(%r8), %xmm2 ; SSE-NEXT: movaps 48(%r9), %xmm3 @@ -4555,26 +4560,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdx), %xmm6 ; SSE-NEXT: movaps 64(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 64(%rdi), %xmm6 +; SSE-NEXT: movaps 64(%rdi), %xmm7 ; SSE-NEXT: movaps 64(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 64(%r8), %xmm2 ; SSE-NEXT: movaps 64(%r9), %xmm3 @@ -4588,26 +4593,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdx), %xmm6 ; SSE-NEXT: movaps 80(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 80(%rdi), %xmm6 +; SSE-NEXT: movaps 80(%rdi), %xmm7 ; SSE-NEXT: movaps 80(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 80(%r8), %xmm2 ; SSE-NEXT: movaps 80(%r9), %xmm3 @@ -4621,26 +4626,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdx), %xmm6 ; SSE-NEXT: movaps 96(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 96(%rdi), %xmm6 +; SSE-NEXT: movaps 96(%rdi), %xmm7 ; SSE-NEXT: movaps 96(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 96(%r8), %xmm2 ; SSE-NEXT: movaps 96(%r9), %xmm3 @@ -4654,26 +4659,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdx), %xmm7 +; SSE-NEXT: movaps 112(%rdx), %xmm6 ; SSE-NEXT: movaps 112(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 112(%rdi), %xmm6 +; SSE-NEXT: movaps 112(%rdi), %xmm7 ; SSE-NEXT: movaps 112(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 112(%r8), %xmm2 ; SSE-NEXT: movaps 112(%r9), %xmm3 @@ -4687,26 +4692,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdx), %xmm6 ; SSE-NEXT: movaps 128(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 128(%rdi), %xmm6 +; SSE-NEXT: movaps 128(%rdi), %xmm7 ; SSE-NEXT: movaps 128(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 128(%r8), %xmm2 ; SSE-NEXT: movaps 128(%r9), %xmm3 @@ -4720,26 +4725,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rdx), %xmm6 ; SSE-NEXT: movaps 144(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 144(%rdi), %xmm6 +; SSE-NEXT: movaps 144(%rdi), %xmm7 ; SSE-NEXT: movaps 144(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 144(%r8), %xmm2 ; SSE-NEXT: movaps 144(%r9), %xmm3 @@ -4753,26 +4758,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdx), %xmm6 ; SSE-NEXT: movaps 160(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 160(%rdi), %xmm6 +; SSE-NEXT: movaps 160(%rdi), %xmm7 ; SSE-NEXT: movaps 160(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 160(%r8), %xmm2 ; SSE-NEXT: movaps 160(%r9), %xmm3 @@ -4786,26 +4791,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rdx), %xmm6 ; SSE-NEXT: movaps 176(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 176(%rdi), %xmm6 +; SSE-NEXT: movaps 176(%rdi), %xmm7 ; SSE-NEXT: movaps 176(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 176(%r8), %xmm2 ; SSE-NEXT: movaps 176(%r9), %xmm3 @@ -4819,26 +4824,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdx), %xmm6 ; SSE-NEXT: movaps 192(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 192(%rdi), %xmm6 +; SSE-NEXT: movaps 192(%rdi), %xmm7 ; SSE-NEXT: movaps 192(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 192(%r8), %xmm2 ; SSE-NEXT: movaps 192(%r9), %xmm3 @@ -4852,26 +4857,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rdx), %xmm6 ; SSE-NEXT: movaps 208(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 208(%rdi), %xmm6 +; SSE-NEXT: movaps 208(%rdi), %xmm7 ; SSE-NEXT: movaps 208(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 208(%r8), %xmm2 ; SSE-NEXT: movaps 208(%r9), %xmm3 @@ -4885,84 +4890,84 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdx), %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdx), %xmm9 ; SSE-NEXT: movaps 224(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: movaps 224(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdi), %xmm11 ; SSE-NEXT: movaps 224(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: movaps %xmm11, %xmm13 ; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] ; SSE-NEXT: movaps 224(%r8), %xmm2 -; SSE-NEXT: movaps 224(%r9), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm15 +; SSE-NEXT: movaps 224(%r9), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm13[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm5[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] -; SSE-NEXT: movaps %xmm2, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm8[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] -; SSE-NEXT: movaps 240(%rdx), %xmm2 -; SSE-NEXT: movaps 240(%rcx), %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm11[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2] +; SSE-NEXT: movaps 240(%rdx), %xmm3 +; SSE-NEXT: movaps 240(%rcx), %xmm12 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] +; SSE-NEXT: movaps 240(%rdi), %xmm2 +; SSE-NEXT: movaps 240(%rsi), %xmm10 ; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 240(%rsi), %xmm14 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] -; SSE-NEXT: movaps 240(%r8), %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] +; SSE-NEXT: movaps 240(%r8), %xmm1 ; SSE-NEXT: movaps 240(%r9), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm1[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm3, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm7[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm3[0,2] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 1520(%rax) -; SSE-NEXT: movaps %xmm14, 1504(%rax) -; SSE-NEXT: movaps %xmm0, 1488(%rax) -; SSE-NEXT: movaps %xmm4, 1472(%rax) +; SSE-NEXT: movaps %xmm3, 1520(%rax) +; SSE-NEXT: movaps %xmm10, 1504(%rax) +; SSE-NEXT: movaps %xmm2, 1488(%rax) +; SSE-NEXT: movaps %xmm5, 1472(%rax) ; SSE-NEXT: movaps %xmm6, 1456(%rax) -; SSE-NEXT: movaps %xmm5, 1440(%rax) -; SSE-NEXT: movaps %xmm11, 1424(%rax) -; SSE-NEXT: movaps %xmm9, 1408(%rax) -; SSE-NEXT: movaps %xmm10, 1392(%rax) -; SSE-NEXT: movaps %xmm12, 1376(%rax) +; SSE-NEXT: movaps %xmm4, 1440(%rax) +; SSE-NEXT: movaps %xmm9, 1424(%rax) +; SSE-NEXT: movaps %xmm8, 1408(%rax) +; SSE-NEXT: movaps %xmm11, 1392(%rax) +; SSE-NEXT: movaps %xmm14, 1376(%rax) ; SSE-NEXT: movaps %xmm15, 1360(%rax) ; SSE-NEXT: movaps %xmm13, 1344(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -5138,7 +5143,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2392, %rsp # imm = 0x958 +; AVX1-ONLY-NEXT: subq $2504, %rsp # imm = 0x9C8 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 @@ -5146,7 +5151,6 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 @@ -5176,6 +5180,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5199,17 +5205,17 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 36(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5219,11 +5225,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 48(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 @@ -5239,17 +5245,17 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 68(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5268,8 +5274,9 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5278,17 +5285,17 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 100(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5337,11 +5344,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 144(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm7 @@ -5355,17 +5362,17 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 164(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5375,11 +5382,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 176(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm3 @@ -5393,15 +5400,17 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 196(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm4[0],ymm10[1],ymm4[1],ymm10[4],ymm4[4],ymm10[5],ymm4[5] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5432,168 +5441,167 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 240(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm15[5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 240(%r9), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm10[1,2],ymm15[1,2],ymm10[5,6],ymm15[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm12[1,2],mem[1,2],ymm12[5,6],mem[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm12[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm13[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[1,2],mem[1,2],ymm12[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 84(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] -; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0,0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1] ; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0,0,0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm13[1,2],mem[1,2],ymm13[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm12[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 116(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm11[2],ymm0[3],ymm11[3],ymm0[6],ymm11[6],ymm0[7],ymm11[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[1,2],mem[1,2],ymm11[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,2],ymm15[1,2],ymm11[5,6],ymm15[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 148(%r8), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 148(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 148(%r8), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 148(%r9), %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[0,0,0,0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm14[0,0,0,0] +; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0,0,0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm13[0,0,0,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r8), %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 160(%r9), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm8[1,2],ymm6[5,6],ymm8[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 180(%r8), %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 180(%r9), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss 180(%r8), %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 180(%r9), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[0,0,0,0] +; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0,0,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r8), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 192(%r9), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm10[2],ymm4[2],ymm10[3],ymm4[3],ymm10[6],ymm4[6],ymm10[7],ymm4[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm3[1,2],mem[1,2],ymm3[5,6],mem[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,2],ymm2[1,2],ymm4[5,6],ymm2[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 212(%r8), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 212(%r9), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2],ymm5[1,2],ymm1[5,6],ymm5[5,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm9[2],ymm0[2],ymm9[3],ymm0[3],ymm9[6],ymm0[6],ymm9[7],ymm0[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,2],ymm1[1,2],ymm5[5,6],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vbroadcastss 244(%r8), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 244(%r9), %ymm1 @@ -5605,8 +5613,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm1 @@ -5622,15 +5630,14 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm10[3,0],ymm0[7,4],ymm10[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] @@ -5645,8 +5652,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm1 @@ -5662,8 +5669,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5680,18 +5687,18 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 64(%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 64(%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload @@ -5699,73 +5706,98 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6],ymm5[7] -; AVX1-ONLY-NEXT: vbroadcastss 128(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vbroadcastss 128(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[2,3,4,5,6],ymm8[7] +; AVX1-ONLY-NEXT: vbroadcastss 128(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vbroadcastss 128(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3],ymm11[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 128(%r9), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm11[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 128(%r9), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3,4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm15[3,0],mem[3,0],ymm15[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = mem[2,3],ymm11[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm13[2],mem[2],xmm13[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm12 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4,5,6],ymm12[7] @@ -5773,115 +5805,91 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = mem[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm12[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4,5,6],ymm12[7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4,5,6],ymm12[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[2,3],ymm13[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm6[2,3],ymm13[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm13 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2,3,4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm14[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,0],ymm3[3,0],ymm0[7,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm14[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1],ymm3[2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4,5,6],ymm14[7] -; AVX1-ONLY-NEXT: vbroadcastss 224(%rcx), %xmm14 -; AVX1-ONLY-NEXT: vbroadcastss 224(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4,5,6],ymm13[7] +; AVX1-ONLY-NEXT: vbroadcastss 224(%rcx), %xmm13 +; AVX1-ONLY-NEXT: vbroadcastss 224(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm15, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 224(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm14 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm15, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3,4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2,3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm6[3,0],mem[3,0],ymm6[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[2,3],ymm14[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm5[2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm14[1],ymm6[2,3,4,5,6],ymm14[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4,5,6],ymm14[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm6, 1504(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 1408(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1344(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 1312(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 1216(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 1120(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 1024(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 1504(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1408(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 1344(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 1312(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 1216(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 1120(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 1024(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 928(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 832(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 768(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 736(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 640(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 832(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 640(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 544(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5950,13 +5958,13 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $2392, %rsp # imm = 0x958 +; AVX1-ONLY-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride6_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $2488, %rsp # imm = 0x9B8 +; AVX2-SLOW-NEXT: subq $2504, %rsp # imm = 0x9C8 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 @@ -6152,11 +6160,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4],ymm2[5],ymm14[6],ymm2[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero @@ -6186,11 +6194,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -6217,11 +6225,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -6248,11 +6256,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -6279,11 +6287,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 128(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 128(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -6310,11 +6318,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 160(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 160(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -6341,11 +6349,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 192(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 192(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -6601,7 +6609,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa 160(%r8), %ymm14 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 176(%r9), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -6616,8 +6624,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm11 = mem[2,2,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] @@ -6647,12 +6655,12 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm13 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm13 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13 ; AVX2-SLOW-NEXT: vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm14 = mem[2,2,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] @@ -6672,8 +6680,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 224(%r8), %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 240(%r9), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4],ymm3[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 240(%r9), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -6686,13 +6694,13 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm14, 1504(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 1440(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 1440(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm13, 1408(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm12, 1312(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 1248(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm11, 1216(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm10, 1120(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 1056(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 1056(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm9, 1024(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, 928(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm5, 864(%rax) @@ -6768,26 +6776,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: addq $2488, %rsp # imm = 0x9B8 +; AVX2-SLOW-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i32_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $2312, %rsp # imm = 0x908 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-FAST-NEXT: subq $2376, %rsp # imm = 0x948 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm9 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,2,2,3] @@ -6800,22 +6808,21 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm11 ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm10[0],zero,xmm10[1],zero +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm11[0],zero,xmm11[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] @@ -6825,650 +6832,658 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm12 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm12[0],zero,xmm12[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm4 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm10[3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm8 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 128(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm9[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 128(%rsi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 128(%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqa 128(%r8), %xmm14 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm14[0],zero,xmm14[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 132(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 128(%r8), %xmm5 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 132(%r9), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 160(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa 160(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa 160(%r8), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 164(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 160(%r8), %xmm6 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm6[0],zero,xmm6[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 164(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 192(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 192(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqa 192(%r8), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 196(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 196(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa 224(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqa 224(%r8), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 228(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm5 -; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm13 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] +; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 228(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm3 +; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm14 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm14 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss 32(%rcx), %xmm0 -; AVX2-FAST-NEXT: vbroadcastss 32(%rdx), %xmm5 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vbroadcastss 32(%r9), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm0 +; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm1 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm11, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[6],ymm5[6],ymm11[7],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd 64(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm5 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm12, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 64(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm1 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 64(%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 84(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 84(%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 96(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm8, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 96(%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm11 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 116(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 116(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd 128(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd 128(%rdx), %xmm3 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm14, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 128(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 128(%rdx), %xmm4 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm5, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 128(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rcx), %ymm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 128(%rdx), %ymm12 +; AVX2-FAST-NEXT: vmovdqa 128(%rcx), %ymm9 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 148(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 148(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm15[0],mem[0],xmm15[1],mem[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 160(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm6, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 160(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rcx), %ymm9 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %ymm10 +; AVX2-FAST-NEXT: vmovdqa 160(%rcx), %ymm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 160(%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 180(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 180(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vbroadcastss 192(%r9), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rcx), %ymm15 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-NEXT: vpbroadcastd %xmm11, %xmm0 +; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 192(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 212(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 212(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 224(%rcx), %xmm0 -; AVX2-FAST-NEXT: vbroadcastss 224(%rdx), %xmm3 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vbroadcastss 224(%r9), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vbroadcastss 224(%rdx), %xmm4 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd (%rsp), %ymm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vbroadcastss 224(%r9), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 224(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %ymm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 244(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm15 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %ymm14 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm14[0],ymm5[0],ymm14[1],ymm5[1],ymm14[4],ymm5[4],ymm14[5],ymm5[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[2,3],ymm3[2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,6,2,3,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm5, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm6, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm5, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm5, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 244(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm4[2,3,4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm12 -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[4],mem[4],ymm4[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,6,2,3,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3,4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[4],ymm6[4],ymm3[5],ymm6[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm13[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm12[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm12[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm3[0],ymm11[0],ymm3[1],ymm11[1],ymm3[4],ymm11[4],ymm3[5],ymm11[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm11[2],ymm3[3],ymm11[3],ymm3[6],ymm11[6],ymm3[7],ymm11[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 128(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa 128(%r9), %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[4],ymm10[4],ymm3[5],ymm10[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 144(%r9), %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqa 128(%r9), %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[4],ymm9[4],ymm12[5],ymm9[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 144(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[6],ymm9[6],ymm12[7],ymm9[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 160(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa 160(%r9), %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm5[1],ymm1[2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[4],ymm9[4],ymm4[5],ymm9[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 176(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4],ymm5[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 160(%r8), %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa 160(%r9), %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[4],ymm7[4],ymm10[5],ymm7[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 176(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[6],ymm7[6],ymm10[7],ymm7[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 192(%r8), %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa 192(%r9), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm4[1],ymm1[2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[4],ymm15[4],ymm12[5],ymm15[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 208(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm13[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[6],ymm15[6],ymm12[7],ymm15[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[2,3],ymm13[2,3] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm13[2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 192(%r8), %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 192(%r9), %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 208(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[2,3],ymm15[2,3] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm15[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vmovdqa 224(%r8), %ymm13 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovdqa 224(%r9), %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm0[1],ymm3[2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vmovdqa 224(%r8), %ymm15 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqa 224(%r9), %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm0[0],ymm8[0],ymm0[1],ymm8[1],ymm0[4],ymm8[4],ymm0[5],ymm8[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 240(%r9), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[2,3],ymm12[2,3] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 240(%r9), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[2,3],ymm8[2,3] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm7, 1504(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 1504(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm3, 1440(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 1408(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm2, 1312(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm4, 1248(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 1216(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 1120(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 1056(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 1024(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm11, 928(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 1216(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 1120(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 1056(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm12, 1024(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm14, 928(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 864(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7545,13 +7560,13 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $2312, %rsp # imm = 0x908 +; AVX2-FAST-NEXT: addq $2376, %rsp # imm = 0x948 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride6_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $2488, %rsp # imm = 0x9B8 +; AVX2-FAST-PERLANE-NEXT: subq $2504, %rsp # imm = 0x9C8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm1 @@ -7747,11 +7762,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4],ymm2[5],ymm14[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero @@ -7781,11 +7796,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -7812,11 +7827,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -7843,11 +7858,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -7874,11 +7889,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -7905,11 +7920,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -7936,11 +7951,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -8196,7 +8211,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%r8), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 176(%r9), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -8211,8 +8226,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] @@ -8242,12 +8257,12 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm13 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] @@ -8267,8 +8282,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%r8), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%r9), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4],ymm3[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%r9), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -8281,13 +8296,13 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 1504(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 1440(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 1440(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 1408(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 1312(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 1248(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 1216(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 1120(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 1056(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 1056(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 1024(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 928(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 864(%rax) @@ -8363,1153 +8378,1145 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $2488, %rsp # imm = 0x9B8 +; AVX2-FAST-PERLANE-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i32_stride6_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $584, %rsp # imm = 0x248 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rsi), %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rcx), %zmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm12 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512F-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, %zmm25 -; AVX512F-SLOW-NEXT: vpermt2d %zmm11, %zmm21, %zmm25 +; AVX512F-SLOW-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rsi), %zmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rcx), %zmm22 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm13 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm29, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm14 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512F-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm22 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512F-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm9, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm10, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512F-SLOW-NEXT: vpermt2d %zmm15, %zmm9, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm9, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm10, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm28, %zmm27 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512F-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm31, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm20 +; AVX512F-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512F-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512F-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm23 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512F-SLOW-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512F-SLOW-NEXT: vpermt2d %zmm15, %zmm28, %zmm23 -; AVX512F-SLOW-NEXT: vpermt2d %zmm15, %zmm31, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm28, %zmm26 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm31, %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, %zmm18 -; AVX512F-SLOW-NEXT: vpermi2d %zmm12, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 +; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm5 +; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm1, %zmm27 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermi2d %zmm12, %zmm3, %zmm9 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermi2d %zmm12, %zmm3, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermi2d %zmm12, %zmm3, %zmm28 -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm31, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm31 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [3,11,0,8,7,15,4,12] -; AVX512F-SLOW-NEXT: vpermt2d (%rcx), %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512F-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512F-SLOW-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512F-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm21, %zmm22 +; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm18 +; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm21, %zmm18 +; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm28 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX512F-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm21 +; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdx), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] +; AVX512F-SLOW-NEXT: vpermt2d (%rcx), %ymm2, %ymm14 ; AVX512F-SLOW-NEXT: movb $36, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm14[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512F-SLOW-NEXT: vpermt2d %zmm20, %zmm21, %zmm19 -; AVX512F-SLOW-NEXT: vpermt2d 64(%rcx), %ymm0, %ymm13 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm13[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm16 -; AVX512F-SLOW-NEXT: vpermt2d %zmm17, %zmm21, %zmm16 -; AVX512F-SLOW-NEXT: vpermt2d 128(%rcx), %ymm0, %ymm31 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm31[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdx), %ymm14 -; AVX512F-SLOW-NEXT: vpermt2d 192(%rcx), %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm20 -; AVX512F-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm31 -; AVX512F-SLOW-NEXT: vpermi2d %zmm31, %zmm20, %zmm21 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm14[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm14, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%r8), %zmm10 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm14, %zmm19 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%r8), %zmm9 -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm14, %zmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 192(%r8), %zmm13 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm14, %zmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm11, %zmm3, %zmm15 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm14[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm29, %zmm13 +; AVX512F-SLOW-NEXT: vpermt2d 64(%rcx), %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm29, %zmm14 +; AVX512F-SLOW-NEXT: vpermt2d 128(%rcx), %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdx), %ymm0 +; AVX512F-SLOW-NEXT: vpermt2d 192(%rcx), %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm7 +; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm29 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 192(%r8), %zmm6 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm29 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm31, %zmm16 ; AVX512F-SLOW-NEXT: movb $-110, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512F-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm11, %zmm2, %zmm14 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, %zmm22 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm8[2],zmm11[2],zmm8[3],zmm11[3],zmm8[6],zmm11[6],zmm8[7],zmm11[7],zmm8[10],zmm11[10],zmm8[11],zmm11[11],zmm8[14],zmm11[14],zmm8[15],zmm11[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm5, %zmm20 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm11[2],zmm10[2],zmm11[3],zmm10[3],zmm11[6],zmm10[6],zmm11[7],zmm10[7],zmm11[10],zmm10[10],zmm11[11],zmm10[11],zmm11[14],zmm10[14],zmm11[15],zmm10[15] ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512F-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm11, %zmm1, %zmm22 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm3, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, %zmm8 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm2, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm29 = zmm29[2],zmm4[2],zmm29[3],zmm4[3],zmm29[6],zmm4[6],zmm29[7],zmm4[7],zmm29[10],zmm4[10],zmm29[11],zmm4[11],zmm29[14],zmm4[14],zmm29[15],zmm4[15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512F-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-SLOW-NEXT: vpermt2d %zmm17, %zmm2, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm24 = zmm24[2],zmm17[2],zmm24[3],zmm17[3],zmm24[6],zmm17[6],zmm24[7],zmm17[7],zmm24[10],zmm17[10],zmm24[11],zmm17[11],zmm24[14],zmm17[14],zmm24[15],zmm17[15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vpermi2d %zmm31, %zmm20, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm15 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm11 -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm6 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm3 -; AVX512F-SLOW-NEXT: vpermi2d %zmm31, %zmm20, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm14 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm8 -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm5 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm2 -; AVX512F-SLOW-NEXT: vpermi2d %zmm31, %zmm20, %zmm1 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm11, %zmm23 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm22 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm7 -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm4 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm17 = ymm17[2],mem[2],ymm17[3],mem[3],ymm17[6],mem[6],ymm17[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k1} = zmm17[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm17 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm17 = ymm17[2],mem[2],ymm17[3],mem[3],ymm17[6],mem[6],ymm17[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm17[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm17 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm17 = ymm17[2],mem[2],ymm17[3],mem[3],ymm17[6],mem[6],ymm17[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k1} = zmm17[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm17 = ymm17[2],mem[2],ymm17[3],mem[3],ymm17[6],mem[6],ymm17[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm17[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm27 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm23 -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm26 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm10 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm31, %zmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm25 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm30 = zmm30[2],zmm8[2],zmm30[3],zmm8[3],zmm30[6],zmm8[6],zmm30[7],zmm8[7],zmm30[10],zmm8[10],zmm30[11],zmm8[11],zmm30[14],zmm8[14],zmm30[15],zmm8[15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm11, %zmm26 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm17[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm29[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 {%k1} = zmm24[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm10 = zmm20[2],zmm31[2],zmm20[3],zmm31[3],zmm20[6],zmm31[6],zmm20[7],zmm31[7],zmm20[10],zmm31[10],zmm20[11],zmm31[11],zmm20[14],zmm31[14],zmm20[15],zmm31[15] -; AVX512F-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm10[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 128(%r9), %zmm10 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqa64 192(%r9), %zmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm25 -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm19 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm16 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm15 -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm11 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm6 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm14 -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm8 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm5 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm22 -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm7 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm4 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm27 -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm23 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm26 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm28 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm0 -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm29 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm30 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm31, %zmm8 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm5, %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm11, %zmm27 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm31 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm10 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm31 +; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm20 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm25 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm9 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 +; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm23 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm26 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm27 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[2,3,2,3,2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[2,3,2,3,2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[2,3,2,3,2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm19 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm22 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm21 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm24 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm17 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm0[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm12[2],zmm7[2],zmm12[3],zmm7[3],zmm12[6],zmm7[6],zmm12[7],zmm7[7],zmm12[10],zmm7[10],zmm12[11],zmm7[11],zmm12[14],zmm7[14],zmm12[15],zmm7[15] +; AVX512F-SLOW-NEXT: vmovdqa64 64(%r9), %zmm4 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 192(%r9), %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm13 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm14 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm16 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm10 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm31 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm25 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm9 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm23 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm26 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm22 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm18 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm21 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm24 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm17 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm15 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 1472(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 1408(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 1344(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 1152(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, 704(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 576(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, 1280(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 1216(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 896(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 832(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512F-SLOW-NEXT: addq $584, %rsp # imm = 0x248 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 1472(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 1408(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 1344(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, 1152(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 1024(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 768(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 704(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 640(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 1280(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, 1216(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 896(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 512(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512F-SLOW-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i32_stride6_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm24 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512F-FAST-NEXT: subq $1160, %rsp # imm = 0x488 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm20 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 192(%rsi), %zmm14 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm15 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512F-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm31 -; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm8, %zmm31 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm21 -; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm28 -; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm6, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] -; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm11 -; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm6, %zmm11 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512F-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm12, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512F-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm13, %zmm6 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] -; AVX512F-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm27, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm5, %zmm6 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm24 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rsi), %zmm29 +; AVX512F-FAST-NEXT: vmovdqa64 192(%rsi), %zmm23 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm21 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm9, %zmm6 +; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm11, %zmm16 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm12, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm13, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm27, %zmm6 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm11, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm13, %zmm30 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm3 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm1 -; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm5 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] +; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm31 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm25 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] +; AVX512F-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm30, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm12, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm30, %zmm5 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm7, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm28 +; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm9, %zmm19 +; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm7 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm11 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm12 +; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm13 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm27, %zmm2 +; AVX512F-FAST-NEXT: vpermt2d %zmm23, %zmm30, %zmm2 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm27, %zmm14 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm6, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm7, %zmm22 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512F-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512F-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512F-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm2, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm2, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm21 ; AVX512F-FAST-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm19 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm5 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm15 -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdx), %zmm17 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm22 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm30 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm6 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdx), %zmm13 ; AVX512F-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm23 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm16 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm6, %zmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm8 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 -; AVX512F-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm15 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 192(%rdx), %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm25 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm27 -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm6 -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm24 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm29 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 ; AVX512F-FAST-NEXT: movb $-110, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512F-FAST-NEXT: movb $36, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm28 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm31 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm24 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm27 {%k2} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512F-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm24 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm19 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm5 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm15 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm21 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm16 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm29 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm30 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm28 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512F-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm29 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm30 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm17 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm3 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm28 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm13 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2} ; AVX512F-FAST-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm27 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm29 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k2} ; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm21 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm14 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm31 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm9 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 ; AVX512F-FAST-NEXT: vmovdqa64 64(%r9), %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm24 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm19 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm5 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm22 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm30 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm19 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm17 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm21 ; AVX512F-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm23 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm29 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm30 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm17 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm16 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm3 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm26 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm28 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 ; AVX512F-FAST-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm11 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm12 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm27 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm16 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm18 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm3 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm9 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm10 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm29 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm12 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 1472(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 1344(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 1280(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 1216(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 1152(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 1088(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 1024(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 960(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 832(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, 768(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 640(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 576(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 512(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, (%rax) -; AVX512F-FAST-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 1472(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 1408(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 1344(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 1280(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 1152(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 960(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 896(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 768(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, 704(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 576(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 512(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512F-FAST-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: store_i32_stride6_vf64: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: subq $584, %rsp # imm = 0x248 -; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm11 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm16 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm15 -; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm12 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm25 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm11, %zmm21, %zmm25 +; AVX512BW-SLOW-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm30 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm28 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm22 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm16 +; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm13 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm29, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm14 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512BW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm22 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512BW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm9, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm10, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm15, %zmm9, %zmm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm18 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm9, %zmm18 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm10, %zmm18 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm28, %zmm27 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512BW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm31, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm20 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512BW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm23 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm15, %zmm28, %zmm23 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm15, %zmm31, %zmm4 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm28, %zmm26 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm31, %zmm30 -; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm18 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm12, %zmm3, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm25 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm26 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm5 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm1, %zmm27 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vpermi2d %zmm12, %zmm3, %zmm9 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vpermi2d %zmm12, %zmm3, %zmm10 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vpermi2d %zmm12, %zmm3, %zmm28 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm31, %zmm18 -; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm31 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [3,11,0,8,7,15,4,12] -; AVX512BW-SLOW-NEXT: vpermt2d (%rcx), %ymm0, %ymm14 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512BW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512BW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm24 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm21, %zmm22 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm18 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm21, %zmm18 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm28 +; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm6 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm21 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdx), %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] +; AVX512BW-SLOW-NEXT: vpermt2d (%rcx), %ymm2, %ymm14 ; AVX512BW-SLOW-NEXT: movb $36, %al ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm14[0,1,0,1,2,3,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm20, %zmm21, %zmm19 -; AVX512BW-SLOW-NEXT: vpermt2d 64(%rcx), %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm13[0,1,0,1,2,3,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm16 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm17, %zmm21, %zmm16 -; AVX512BW-SLOW-NEXT: vpermt2d 128(%rcx), %ymm0, %ymm31 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm31[0,1,0,1,2,3,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdx), %ymm14 -; AVX512BW-SLOW-NEXT: vpermt2d 192(%rcx), %ymm0, %ymm14 -; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm20 -; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm31 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm31, %zmm20, %zmm21 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm14[0,1,0,1,2,3,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm12 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm14, %zmm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm10 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm14, %zmm19 -; AVX512BW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm9 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm14, %zmm16 -; AVX512BW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm13 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm14, %zmm21 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm11, %zmm3, %zmm15 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm14[0,1,0,1,2,3,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm29, %zmm13 +; AVX512BW-SLOW-NEXT: vpermt2d 64(%rcx), %ymm2, %ymm1 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,0,1,2,3,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm29, %zmm14 +; AVX512BW-SLOW-NEXT: vpermt2d 128(%rcx), %ymm2, %ymm0 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdx), %ymm0 +; AVX512BW-SLOW-NEXT: vpermt2d 192(%rcx), %ymm2, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm7 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm29 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm13 +; AVX512BW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 +; AVX512BW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm6 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm29 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm31, %zmm16 ; AVX512BW-SLOW-NEXT: movb $-110, %al ; AVX512BW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512BW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm11, %zmm2, %zmm14 -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm22 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm8[2],zmm11[2],zmm8[3],zmm11[3],zmm8[6],zmm11[6],zmm8[7],zmm11[7],zmm8[10],zmm11[10],zmm8[11],zmm11[11],zmm8[14],zmm11[14],zmm8[15],zmm11[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512BW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm5, %zmm20 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm11[2],zmm10[2],zmm11[3],zmm10[3],zmm11[6],zmm10[6],zmm11[7],zmm10[7],zmm11[10],zmm10[10],zmm11[11],zmm10[11],zmm11[14],zmm10[14],zmm11[15],zmm10[15] ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm11, %zmm1, %zmm22 -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm3, %zmm11 -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm8 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm2, %zmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm29 = zmm29[2],zmm4[2],zmm29[3],zmm4[3],zmm29[6],zmm4[6],zmm29[7],zmm4[7],zmm29[10],zmm4[10],zmm29[11],zmm4[11],zmm29[14],zmm4[14],zmm29[15],zmm4[15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm17, %zmm2, %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm24 = zmm24[2],zmm17[2],zmm24[3],zmm17[3],zmm24[6],zmm17[6],zmm24[7],zmm17[7],zmm24[10],zmm17[10],zmm24[11],zmm17[11],zmm24[14],zmm17[14],zmm24[15],zmm17[15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm4 -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vpermi2d %zmm31, %zmm20, %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm15 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm11 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm6 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm3 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm31, %zmm20, %zmm2 -; AVX512BW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm14 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm8 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm5 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm2 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm31, %zmm20, %zmm1 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512BW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm11, %zmm23 ; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm22 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm7 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm4 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm17 = ymm17[2],mem[2],ymm17[3],mem[3],ymm17[6],mem[6],ymm17[7],mem[7] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k1} = zmm17[2,3,2,3,2,3,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm17 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm17 = ymm17[2],mem[2],ymm17[3],mem[3],ymm17[6],mem[6],ymm17[7],mem[7] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm17[2,3,2,3,2,3,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm17 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm17 = ymm17[2],mem[2],ymm17[3],mem[3],ymm17[6],mem[6],ymm17[7],mem[7] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k1} = zmm17[2,3,2,3,2,3,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm17 = ymm17[2],mem[2],ymm17[3],mem[3],ymm17[6],mem[6],ymm17[7],mem[7] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm17[2,3,2,3,2,3,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm27 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm23 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm26 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm28 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm10 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm31, %zmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm25 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm25 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm30 = zmm30[2],zmm8[2],zmm30[3],zmm8[3],zmm30[6],zmm8[6],zmm30[7],zmm8[7],zmm30[10],zmm8[10],zmm30[11],zmm8[11],zmm30[14],zmm8[14],zmm30[15],zmm8[15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm11, %zmm26 ; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm17[6,7,6,7,6,7,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm29[6,7,6,7,6,7,6,7] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm12 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 {%k1} = zmm24[6,7,6,7,6,7,6,7] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm10 = zmm20[2],zmm31[2],zmm20[3],zmm31[3],zmm20[6],zmm31[6],zmm20[7],zmm31[7],zmm20[10],zmm31[10],zmm20[11],zmm31[11],zmm20[14],zmm31[14],zmm20[15],zmm31[15] -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm10[6,7,6,7,6,7,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm10 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 -; AVX512BW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm25 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm19 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm16 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm21 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm15 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm11 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm6 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm14 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm8 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm5 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm22 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm7 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm4 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm27 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm23 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm26 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm28 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm0 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm29 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm30 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm31, %zmm8 +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm5, %zmm9 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm11, %zmm27 +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm31 +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm10 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm31 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm20 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm25 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm9 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm11 +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm23 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm26 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm27 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[2,3,2,3,2,3,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[2,3,2,3,2,3,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[2,3,2,3,2,3,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm19 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm22 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm21 +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm24 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm17 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm0[6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm28 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm12[2],zmm7[2],zmm12[3],zmm7[3],zmm12[6],zmm7[6],zmm12[7],zmm7[7],zmm12[10],zmm7[10],zmm12[11],zmm7[11],zmm12[14],zmm7[14],zmm12[15],zmm7[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm4 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm13 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm14 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm16 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm10 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm8 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm31 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm25 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm9 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm23 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm26 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm11 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm22 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm18 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm21 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm24 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm17 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm15 ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, 1472(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 1408(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, 1344(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, 1152(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm29, 704(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, 576(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, 384(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, 256(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, 1280(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm21, 1216(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, 896(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, 832(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm27, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512BW-SLOW-NEXT: addq $584, %rsp # imm = 0x248 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, 1472(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, 1408(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, 1344(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm31, 1152(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm27, 1024(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, 768(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, 704(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, 640(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, 320(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, 256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm21, 1280(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm29, 1216(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, 896(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, 512(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-SLOW-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: store_i32_stride6_vf64: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm24 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-FAST-NEXT: subq $1160, %rsp # imm = 0x488 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512BW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm20 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512BW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm14 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm15 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm31 -; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm8, %zmm31 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm21 -; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm28 -; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm6, %zmm28 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] -; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm11 -; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm6, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512BW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm12, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm13, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] -; AVX512BW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm27, %zmm24 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm24 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm29 +; AVX512BW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm23 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm21 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm5, %zmm6 +; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm9, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm11, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm12, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm13, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm27, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm11, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm29 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm13, %zmm30 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm1 -; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm5 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] +; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm31 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm25 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512BW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512BW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] +; AVX512BW-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm30, %zmm25 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm12, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm30, %zmm5 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm7, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm28 +; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm9, %zmm19 +; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm11 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm12 +; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm13 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm27, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2d %zmm23, %zmm30, %zmm2 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm27, %zmm14 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm6, %zmm12 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm7, %zmm22 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512BW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512BW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512BW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm2, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm2, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm21 ; AVX512BW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm19 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm5 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm10 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm17 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm22 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm30 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm6 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm13 ; AVX512BW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm23 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm16 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm4 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm6, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm8 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 -; AVX512BW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm1 ; AVX512BW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm25 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm27 -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm6 -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm24 +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm29 +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 ; AVX512BW-FAST-NEXT: movb $-110, %al ; AVX512BW-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512BW-FAST-NEXT: movb $36, %al ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 {%k2} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm28 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm14 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm31 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm24 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, %zmm27 {%k2} ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm19 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm24 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm19 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm5 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k2} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, %zmm10 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm16 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm21 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm16 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k1} ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm29 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm30 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm28 {%k1} ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm10 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm4 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm29 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm30 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm17 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm3 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm28 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm13 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 {%k1} -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm27 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm29 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm25 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm28 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm31 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 ; AVX512BW-FAST-NEXT: vmovdqa64 64(%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm24 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm19 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm5 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm22 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm30 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm19 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm17 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm21 ; AVX512BW-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm23 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm10 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm4 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm29 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm30 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm17 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm16 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm3 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm26 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm28 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 ; AVX512BW-FAST-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm11 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm12 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm27 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm16 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm18 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm3 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm9 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm10 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm29 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm12 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 1472(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 1344(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, 1280(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, 1216(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 1152(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 1088(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm30, 1024(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, 960(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 832(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, 768(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 640(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 576(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 512(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 320(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, 256(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, (%rax) -; AVX512BW-FAST-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 1472(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, 1408(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, 1344(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, 1280(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 1152(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, 960(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 896(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 768(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, 704(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, 576(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm30, 512(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, 448(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, 256(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, 192(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512BW-FAST-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll index 1180f88f3d118..fe216de5231a3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll @@ -617,138 +617,131 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride7_vf8: ; SSE: # %bb.0: -; SSE-NEXT: subq $104, %rsp +; SSE-NEXT: subq $24, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm14 -; SSE-NEXT: movdqa (%rsi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm9 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: movdqa 16(%rsi), %xmm5 -; SSE-NEXT: movdqa 16(%rdx), %xmm13 -; SSE-NEXT: movdqa 16(%rcx), %xmm9 +; SSE-NEXT: movdqa 16(%rdx), %xmm6 +; SSE-NEXT: movdqa 16(%rcx), %xmm12 ; SSE-NEXT: movdqa 16(%r8), %xmm11 -; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm4 -; SSE-NEXT: movaps 16(%r9), %xmm1 -; SSE-NEXT: movdqa (%rax), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rax), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa (%r9), %xmm8 +; SSE-NEXT: movaps 16(%r9), %xmm0 +; SSE-NEXT: movdqa (%rax), %xmm10 +; SSE-NEXT: movaps 16(%rax), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] +; SSE-NEXT: movaps %xmm0, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm12[1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: movdqa %xmm5, %xmm14 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rdx), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3] -; SSE-NEXT: movaps (%rcx), %xmm3 -; SSE-NEXT: movaps (%r8), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] ; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm13[0] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm10[3,3] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm12[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,1],xmm14[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm13[2,0] -; SSE-NEXT: movaps (%rsp), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: movaps %xmm11, %xmm2 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[1,3] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm1[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm8[0] -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[0,1],mem[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm11[2,0] -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm1[0],xmm6[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm4[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm7 = xmm1[0],xmm7[1,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rdx), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: movss {{.*#+}} xmm13 = xmm5[0],xmm13[1,2,3] +; SSE-NEXT: movaps (%rcx), %xmm0 +; SSE-NEXT: movaps (%r8), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm5[2,0] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm6[0] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm14[3,3] +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm7[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,1],xmm9[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[2,0] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm14[2,0] +; SSE-NEXT: movaps %xmm4, %xmm15 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; SSE-NEXT: movaps %xmm2, %xmm14 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm3[1,3] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[0,2] +; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm15[0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm11[0] +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1] +; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[0,1],mem[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[3,3],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm10[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm14, 112(%rax) -; SSE-NEXT: movdqa %xmm15, 176(%rax) -; SSE-NEXT: movaps %xmm9, (%rax) -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm2, 64(%rax) -; SSE-NEXT: movaps %xmm13, 128(%rax) -; SSE-NEXT: movaps %xmm12, 192(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) +; SSE-NEXT: movaps %xmm9, 112(%rax) +; SSE-NEXT: movdqa %xmm5, 176(%rax) +; SSE-NEXT: movdqa %xmm8, (%rax) +; SSE-NEXT: movaps %xmm3, 16(%rax) +; SSE-NEXT: movaps %xmm14, 64(%rax) +; SSE-NEXT: movaps %xmm6, 128(%rax) +; SSE-NEXT: movaps %xmm7, 192(%rax) +; SSE-NEXT: movaps %xmm13, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm7, 96(%rax) +; SSE-NEXT: movaps %xmm4, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) -; SSE-NEXT: movaps %xmm6, 80(%rax) +; SSE-NEXT: movaps %xmm1, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) -; SSE-NEXT: addq $104, %rsp +; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf8: @@ -776,8 +769,8 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[1,1],xmm5[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm13[1],xmm11[1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm12[1],xmm11[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm11[1,1],xmm9[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm15 @@ -785,19 +778,19 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm14[1],xmm15[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2],ymm9[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5],ymm4[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm13[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,0],xmm13[2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm13[0],ymm4[2],ymm13[2] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm11[0],xmm12[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,0],xmm12[2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3],ymm13[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6],ymm9[7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] @@ -807,19 +800,19 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm9[0,2],ymm8[5,5],ymm9[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[3,3],ymm1[3,3],ymm0[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[3,3],ymm1[3,3],ymm0[7,7],ymm1[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2],ymm9[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6],ymm9[7] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%rax), %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm13[3,3],xmm11[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[3,3],xmm11[3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9 @@ -855,18 +848,18 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm2 -; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm8 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm9 ; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm3 ; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm6 ; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm7 ; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm1 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm4 ; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm5 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm5[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm10, %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm12 ; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm13 ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm13[1],xmm12[1],zero @@ -876,61 +869,61 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm10 = ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[6],ymm3[6],ymm8[7],ymm3[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm10 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm6[2],ymm11[3,4,5],ymm6[6],ymm11[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm14[3,3],xmm15[3,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm14[3,3],xmm15[3,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6],ymm11[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] ; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1],ymm10[2,3,4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm10[2,3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm8 ; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm10 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm12 ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3],ymm10[4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm10[4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,1],ymm3[1,1],ymm9[5,5],ymm3[5,5] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6],ymm10[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6],ymm10[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm13 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[6],ymm9[6],ymm3[7],ymm9[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm10 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,3],ymm7[3,3],ymm6[7,7],ymm7[7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6],ymm8[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6],ymm9[7] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rax), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] @@ -946,7 +939,7 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm12, (%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm11, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1067,18 +1060,18 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm5[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm13[1],xmm12[1],zero @@ -1088,61 +1081,61 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm10 = ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[6],ymm3[6],ymm8[7],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm10 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm6[2],ymm11[3,4,5],ymm6[6],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm14[3,3],xmm15[3,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm14[3,3],xmm15[3,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1],ymm10[2,3,4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm10[2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3],ymm10[4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm10[4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,1],ymm3[1,1],ymm9[5,5],ymm3[5,5] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[6],ymm9[6],ymm3[7],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm10 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,3],ymm7[3,3],ymm6[7,7],ymm7[7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] @@ -1158,7 +1151,7 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1280,84 +1273,86 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $536, %rsp # imm = 0x218 +; SSE-NEXT: subq $520, %rsp # imm = 0x208 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm7 -; SSE-NEXT: movdqa 16(%rsi), %xmm9 +; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rsi), %xmm6 ; SSE-NEXT: movaps (%rdx), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm14 +; SSE-NEXT: movdqa 16(%rdx), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rcx), %xmm8 ; SSE-NEXT: movaps 16(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%r8), %xmm15 -; SSE-NEXT: movaps 16(%r8), %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm11 -; SSE-NEXT: movdqa 16(%r9), %xmm12 -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm4 +; SSE-NEXT: movaps 16(%r8), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r9), %xmm13 +; SSE-NEXT: movdqa 16(%r9), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm11 ; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rax), %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rsi), %xmm8 -; SSE-NEXT: movaps 32(%rdx), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movdqa 32(%rsi), %xmm1 +; SSE-NEXT: movaps 32(%rdx), %xmm2 +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 32(%rcx), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%r9), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%r9), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rax), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rsi), %xmm1 @@ -1367,178 +1362,176 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm2 -; SSE-NEXT: movaps 48(%r8), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps 48(%rax), %xmm7 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[0,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%r9), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: movaps 48(%rcx), %xmm3 +; SSE-NEXT: movaps 48(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: movaps 48(%rdi), %xmm2 +; SSE-NEXT: movaps 48(%rax), %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[0,3] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,0] +; SSE-NEXT: movaps 48(%r9), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[1,3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm11[2],xmm15[3],xmm11[3] -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm15[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[1,3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm5 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm11[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm8[2],xmm14[3],xmm8[3] +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm12 -; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm10[0,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm5, %xmm10 ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm13[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm9[1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0,1],mem[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm7[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm7[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm7[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm7[0],xmm0[1,2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0,1],mem[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm7[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm7[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm7[0],xmm2[1,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm7 ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm7[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm6[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm7 = xmm6[0],xmm7[1,2,3] -; SSE-NEXT: shufps $255, (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm1[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm14 = xmm6[0],xmm14[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm1[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm13 = xmm6[0],xmm13[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm1[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm9 = xmm6[0],xmm9[1,2,3] -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm0, 416(%rax) -; SSE-NEXT: movaps %xmm4, 400(%rax) -; SSE-NEXT: movaps %xmm3, 384(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 352(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 336(%rax) +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm7[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm12 = xmm7[0],xmm12[1,2,3] +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm9 = xmm7[0],xmm9[1,2,3] +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm0[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm7[0],xmm8[1,2,3] +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movaps %xmm1, 416(%rax) +; SSE-NEXT: movaps %xmm3, 400(%rax) +; SSE-NEXT: movaps %xmm4, 384(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rax) ; SSE-NEXT: movdqa %xmm5, 288(%rax) -; SSE-NEXT: movaps %xmm8, 240(%rax) -; SSE-NEXT: movdqa %xmm11, 224(%rax) +; SSE-NEXT: movaps %xmm6, 240(%rax) +; SSE-NEXT: movdqa %xmm15, 224(%rax) ; SSE-NEXT: movaps %xmm10, 176(%rax) -; SSE-NEXT: movaps %xmm12, 128(%rax) -; SSE-NEXT: movaps %xmm15, 112(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movaps %xmm11, 128(%rax) +; SSE-NEXT: movaps %xmm13, 112(%rax) +; SSE-NEXT: movaps %xmm14, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1547,31 +1540,32 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 432(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%rax) -; SSE-NEXT: movaps %xmm9, 320(%rax) -; SSE-NEXT: movaps %xmm13, 304(%rax) +; SSE-NEXT: movaps %xmm8, 320(%rax) +; SSE-NEXT: movaps %xmm9, 304(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rax) -; SSE-NEXT: movaps %xmm14, 208(%rax) -; SSE-NEXT: movaps %xmm7, 192(%rax) +; SSE-NEXT: movaps %xmm12, 208(%rax) +; SSE-NEXT: movaps %xmm2, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps %xmm2, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: addq $536, %rsp # imm = 0x218 +; SSE-NEXT: addq $520, %rsp # imm = 0x208 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX1-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm6 @@ -1581,12 +1575,11 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm2 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[6],ymm7[6],ymm4[7],ymm7[7] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm13 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm8 +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,0],ymm5[4,5],ymm4[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm8 +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] @@ -1596,16 +1589,16 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm10 -; AVX1-ONLY-NEXT: vmovaps %xmm10, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm11 +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm11[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm12[0],xmm4[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm13 +; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm14 @@ -1616,7 +1609,7 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1624,21 +1617,23 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[1,1],xmm10[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[1,1],xmm11[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[1,1],xmm4[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm13[1],xmm12[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm12[1,1],xmm4[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm15[1],xmm14[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,1],ymm8[1,1],ymm6[5,5],ymm8[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,1],ymm9[1,1],ymm6[5,5],ymm9[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1],ymm7[1,1],ymm13[5,5],ymm7[5,5] -; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,1],ymm8[1,1],ymm7[5,5],ymm8[5,5] +; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm13 +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm11 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] @@ -1648,20 +1643,22 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1],xmm0[0,2] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm14 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm14[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1],xmm0[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm2[1],zero -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm12 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm12[1],xmm2[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm15 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[1,1],xmm14[1,1] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm4[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1669,73 +1666,71 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[2,1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm5[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[1,1],ymm10[1,1],ymm8[5,5],ymm10[5,5] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[1,1],ymm10[1,1],ymm9[5,5],ymm10[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1],ymm6[1,1],ymm7[5,5],ymm6[5,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1],ymm7[1,1],ymm8[5,5],ymm7[5,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm2[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm15 ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm2[2,1],ymm4[6,4],ymm2[6,5] -; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm15[2,1],ymm2[6,4],ymm15[6,5] +; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,1],ymm3[0,2],ymm1[5,5],ymm3[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,1],ymm2[0,2],ymm1[5,5],ymm2[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm13[3,3],mem[3,3],ymm13[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,3],ymm11[3,3],ymm13[7,7],ymm11[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm13[3,3],mem[3,3],ymm13[7,7],mem[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 60(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 60(%r9), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rax), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,3],ymm6[3,3],ymm11[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 60(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 60(%r9), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rax), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,3],ymm7[3,3],ymm6[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm10[3,3],ymm8[3,3],ymm10[7,7],ymm8[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm1[3,3],ymm2[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,3],ymm1[1,2],ymm4[6,7],ymm1[5,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[3,3],ymm9[3,3],ymm10[7,7],ymm9[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3],ymm1[3,3],ymm15[7,7],ymm1[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3],ymm1[1,2],ymm5[6,7],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -1743,42 +1738,43 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,3],xmm11[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm9[2],mem[2],xmm9[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3],xmm14[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm12[2],mem[2],xmm12[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm13[0],mem[0],ymm13[1],mem[1],ymm13[4],mem[4],ymm13[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm13[0],ymm3[2],ymm13[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,1],ymm3[0,2],ymm13[7,5],ymm3[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3],xmm5[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[3,3],xmm6[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[3,1],ymm4[0,2],ymm6[7,5],ymm4[4,6] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[4],ymm9[4],ymm10[5],ymm9[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[3,1],ymm5[0,2],ymm7[7,5],ymm5[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm14[3,3],xmm15[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm12[3,3],xmm14[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7] @@ -1786,12 +1782,10 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1808,104 +1802,104 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) -; AVX1-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride7_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX2-SLOW-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm10 -; AVX2-SLOW-NEXT: vmovaps %xmm10, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm6 +; AVX2-SLOW-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm13 -; AVX2-SLOW-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm8 +; AVX2-SLOW-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm2 ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm14 -; AVX2-SLOW-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm9 +; AVX2-SLOW-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm3 +; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm12 +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm12[1],zero +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm11 -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm9 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm9[1],xmm11[1],zero -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm12 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm7[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm5[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3] -; AVX2-SLOW-NEXT: vbroadcastsd %xmm5, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm12[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2],xmm5[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,2,1] -; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm5 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm5[1],xmm1[1],zero -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] +; AVX2-SLOW-NEXT: vbroadcastsd %xmm2, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm11[2],xmm2[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm13 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm13[1],xmm3[1],zero +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm13 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm1 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2],ymm6[3,4,5],ymm0[6],ymm6[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm14[1],ymm6[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm14 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[6],ymm14[6],ymm1[7],ymm14[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm15 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm6 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm6 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm8 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm9 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6],ymm15[7] ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[6],ymm11[6],ymm6[7],ymm11[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 60(%r8), %ymm1 @@ -1915,9 +1909,9 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastsd 56(%rax), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm5[3,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] @@ -1925,125 +1919,125 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss %xmm11, %xmm3 -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm1 +; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastsd %xmm7, %ymm2 -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastsd %xmm15, %ymm3 +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm1[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm12[3,3] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm12[3,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm0 -; AVX2-SLOW-NEXT: vbroadcastss %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; AVX2-SLOW-NEXT: vmovaps %xmm8, %xmm10 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastsd %xmm11, %ymm2 -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vbroadcastss %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastsd %xmm12, %ymm5 +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1],ymm8[1,1],ymm4[5,5],ymm8[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-SLOW-NEXT: vbroadcastsd 48(%rax), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,1],ymm13[1,1],ymm2[5,5],ymm13[5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm3[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vbroadcastsd 48(%rax), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2],ymm0[3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm3 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm4 = ymm12[2],ymm5[2],ymm12[3],ymm5[3],ymm12[6],ymm5[6],ymm12[7],ymm5[7] -; AVX2-SLOW-NEXT: vmovaps %ymm12, %ymm2 -; AVX2-SLOW-NEXT: vmovaps %ymm5, %ymm12 -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,3],ymm1[3,3],ymm0[7,7],ymm1[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,1],ymm14[1,1],ymm0[5,5],ymm14[5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm2[3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] +; AVX2-SLOW-NEXT: vmovaps %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vmovaps %ymm4, %ymm13 +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,3],ymm3[3,3],ymm1[7,7],ymm3[7,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1,2],ymm9[3,4],ymm7[5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6],ymm4[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = xmm1[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[4],ymm13[4],ymm0[5],ymm13[5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = xmm0[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm13[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = xmm10[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm7 = xmm0[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 352(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 128(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm10, 352(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2060,7 +2054,7 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-SLOW-NEXT: addq $504, %rsp # imm = 0x1F8 +; AVX2-SLOW-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -2068,52 +2062,52 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $536, %rsp # imm = 0x218 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps (%rax), %xmm2 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rax), %xmm3 +; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps (%r8), %xmm7 -; AVX2-FAST-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm3 -; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps (%r9), %xmm8 -; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r8), %xmm5 +; AVX2-FAST-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r9), %xmm7 +; AVX2-FAST-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm4 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] ; AVX2-FAST-NEXT: vmovaps %xmm4, %xmm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm9 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm6[1],xmm9[1],zero +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm8 +; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm11 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm10[1],xmm11[1],zero ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm12 ; AVX2-FAST-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm10 -; AVX2-FAST-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm6 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2],xmm2[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,1,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm2 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm2[1],xmm11[1],zero -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm8[1],zero +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 @@ -2135,20 +2129,20 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm13 ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm12 ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[6],ymm11[6],ymm2[7],ymm11[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm8 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovaps 32(%r9), %ymm4 ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm15 = [5,6,5,6,5,6,5,6] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm15, %ymm15 @@ -2156,7 +2150,7 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm2[2],ymm11[3],ymm2[3],ymm11[6],ymm2[6],ymm11[7],ymm2[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[6],ymm3[6],ymm8[7],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -2168,132 +2162,133 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vbroadcastsd 56(%rax), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm3[3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm6[3,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm15 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [0,1,2,2,0,1,2,2] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm7, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm1[5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovaps %xmm14, %xmm0 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm15 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,1,2,2,0,1,2,2] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm5, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm1[5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovaps %xmm14, %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm14[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1,2],xmm15[3] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1,2],xmm15[3] ; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss %xmm9, %xmm1 -; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm6 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastsd %xmm15, %ymm5 -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastss %xmm11, %xmm2 +; AVX2-FAST-NEXT: vbroadcastss %xmm10, %xmm10 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastsd %xmm14, %ymm9 +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm11[1,1],ymm2[5,5],ymm11[5,5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm8[1,1],ymm3[5,5],ymm8[5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 48(%rax), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3],xmm6[3,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3],xmm6[3,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm3 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss %xmm0, %xmm3 -; AVX2-FAST-NEXT: vbroadcastss %xmm9, %xmm4 +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm4 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastsd %xmm10, %ymm4 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastsd %xmm6, %ymm4 ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,1],ymm1[1,1],ymm3[5,5],ymm1[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm4[5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,1],ymm1[1,1],ymm7[5,5],ymm1[5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2],ymm9[3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2],ymm3[3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[6],ymm0[6],ymm5[7],ymm0[7] ; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovaps %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm2[3,3],ymm7[3,3],ymm2[7,7],ymm7[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm5 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm11[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm8[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm3[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm7 = xmm14[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm10[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm2[3,3],ymm10[3,3],ymm2[7,7],ymm10[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6],ymm5[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm7 = xmm2[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm14[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6],ymm8[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm8 = xmm15[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm5, 320(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm9, 128(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm11, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2320,98 +2315,98 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride7_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX2-FAST-PERLANE-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm10, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm12[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm9[1],xmm11[1],zero -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm7[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm5[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm12[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm5[1],xmm1[1],zero -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm11[2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm13[1],xmm3[1],zero +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[6],ymm14[6],ymm1[7],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2],ymm6[3,4,5],ymm0[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm14[1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[6],ymm11[6],ymm6[7],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 60(%r8), %ymm1 @@ -2421,9 +2416,9 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm5[3,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] @@ -2431,125 +2426,125 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm7, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm1[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm12[3,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm12[3,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm8, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm12, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1],ymm8[1,1],ymm4[5,5],ymm8[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%rax), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,1],ymm13[1,1],ymm2[5,5],ymm13[5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm3[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%rax), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2],ymm0[3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm3 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm4 = ymm12[2],ymm5[2],ymm12[3],ymm5[3],ymm12[6],ymm5[6],ymm12[7],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,3],ymm1[3,3],ymm0[7,7],ymm1[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,1],ymm14[1,1],ymm0[5,5],ymm14[5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm2[3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,3],ymm3[3,3],ymm1[7,7],ymm3[7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1,2],ymm9[3,4],ymm7[5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = xmm1[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[4],ymm13[4],ymm0[5],ymm13[5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm0[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm13[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = xmm10[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm0[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 352(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2566,7 +2561,7 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $504, %rsp # imm = 0x1F8 +; AVX2-FAST-PERLANE-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2824,71 +2819,70 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE: # %bb.0: ; SSE-NEXT: subq $1256, %rsp # imm = 0x4E8 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm7 -; SSE-NEXT: movdqa 16(%rsi), %xmm5 -; SSE-NEXT: movaps (%rdx), %xmm9 -; SSE-NEXT: movdqa 16(%rdx), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm10 -; SSE-NEXT: movaps 16(%rcx), %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm15 +; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: movdqa (%rsi), %xmm10 +; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movaps (%rdx), %xmm14 +; SSE-NEXT: movdqa 16(%rdx), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rcx), %xmm13 +; SSE-NEXT: movaps 16(%rcx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%r8), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r9), %xmm15 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] -; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: movdqa 16(%r9), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] +; SSE-NEXT: movaps %xmm14, %xmm3 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rax), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movdqa 16(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdx), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 32(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%r9), %xmm1 @@ -2898,23 +2892,22 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm8 +; SSE-NEXT: movdqa 48(%rsi), %xmm3 ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm13 +; SSE-NEXT: movaps 48(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%r9), %xmm1 @@ -2924,24 +2917,26 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: movdqa 48(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rsi), %xmm1 -; SSE-NEXT: movaps 64(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 64(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rcx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%r9), %xmm1 @@ -2951,17 +2946,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rsi), %xmm12 +; SSE-NEXT: movdqa 80(%rsi), %xmm3 ; SSE-NEXT: movdqa 80(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 80(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2979,37 +2974,38 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rsi), %xmm4 -; SSE-NEXT: movaps 96(%rdx), %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movdqa 96(%rsi), %xmm6 +; SSE-NEXT: movaps 96(%rdx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 96(%rcx), %xmm3 -; SSE-NEXT: movaps 96(%r8), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%r8), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%r9), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rax), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa 96(%rax), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm12, %xmm0 @@ -3021,225 +3017,227 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa 112(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 112(%rcx), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movaps 112(%rcx), %xmm2 ; SSE-NEXT: movaps 112(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps 112(%r9), %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] +; SSE-NEXT: movaps 112(%r9), %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] ; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 112(%rax), %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm12[3,3] +; SSE-NEXT: movaps 112(%rax), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[1,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[0,2] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[1,1],mem[0,3] +; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[1,1],mem[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[1,1],mem[0,3] +; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm9[1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, %xmm15 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm11 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; SSE-NEXT: movaps %xmm1, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm9 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: movaps %xmm14, %xmm10 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm14[1,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: movaps %xmm4, %xmm7 ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movaps 112(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm12[0,3] -; SSE-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm12[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm12[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm12[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[3,3],mem[3,3] +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm15[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm12[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] @@ -3247,12 +3245,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3] ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -3289,11 +3286,12 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,3],xmm1[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] @@ -3304,11 +3302,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movss {{.*#+}} xmm12 = xmm0[0],xmm12[1,2,3] ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm1[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm0[0],xmm6[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] @@ -3333,19 +3331,19 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 864(%rax) -; SSE-NEXT: movaps %xmm5, 848(%rax) +; SSE-NEXT: movaps %xmm5, 864(%rax) +; SSE-NEXT: movaps %xmm7, 848(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 832(%rax) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 800(%rax) -; SSE-NEXT: movaps %xmm4, 784(%rax) -; SSE-NEXT: movaps %xmm7, 736(%rax) -; SSE-NEXT: movaps %xmm8, 688(%rax) -; SSE-NEXT: movaps %xmm9, 672(%rax) -; SSE-NEXT: movaps %xmm10, 624(%rax) -; SSE-NEXT: movaps %xmm11, 576(%rax) -; SSE-NEXT: movaps %xmm13, 560(%rax) +; SSE-NEXT: movaps %xmm6, 784(%rax) +; SSE-NEXT: movaps %xmm8, 736(%rax) +; SSE-NEXT: movaps %xmm9, 688(%rax) +; SSE-NEXT: movaps %xmm10, 672(%rax) +; SSE-NEXT: movaps %xmm11, 624(%rax) +; SSE-NEXT: movaps %xmm13, 576(%rax) +; SSE-NEXT: movaps %xmm15, 560(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 512(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3393,13 +3391,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 608(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 592(%rax) -; SSE-NEXT: movaps %xmm6, 544(%rax) +; SSE-NEXT: movaps %xmm3, 544(%rax) ; SSE-NEXT: movaps %xmm12, 528(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 496(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 480(%rax) -; SSE-NEXT: movaps %xmm15, 432(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 432(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3416,8 +3415,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 256(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rax) +; SSE-NEXT: movaps %xmm4, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3435,27 +3433,27 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1656, %rsp # imm = 0x678 +; AVX1-ONLY-NEXT: subq $1624, %rsp # imm = 0x658 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 @@ -3472,11 +3470,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] @@ -3491,7 +3489,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm8[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3508,10 +3506,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3521,18 +3518,18 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -3551,11 +3548,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm5[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm7[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm6[1],xmm8[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm6[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3565,11 +3562,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm1[1,1],ymm11[5,5],ymm1[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3585,25 +3582,26 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm8 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3611,34 +3609,34 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm5[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm5[1],xmm8[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm9[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1],ymm13[1,1],ymm14[5,5],ymm13[5,5] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm14[1,1],ymm0[5,5],ymm14[5,5] ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm8[1,1],ymm9[5,5],ymm8[5,5] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm9[1,1],ymm2[5,5],ymm9[5,5] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm10[2,1],ymm1[6,4],ymm10[6,5] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm7[2,1],ymm1[6,4],ymm7[6,5] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] @@ -3646,93 +3644,96 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm7[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm6[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,1] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm11 +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm11[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm3[1],xmm6[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm3[1],xmm4[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[6],ymm4[6],ymm12[7],ymm4[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm1[0,2],ymm8[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[6],ymm5[6],ymm11[7],ymm5[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm1[0,2],ymm11[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[6],ymm9[6],ymm13[7],ymm9[7] +; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm14 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm1[0,2],ymm12[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1],ymm1[0,2],ymm10[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,3],ymm7[3,3],ymm14[7,7],ymm7[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3],ymm2[3,3],ymm6[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3],ymm15[3,3],ymm8[7,7],ymm15[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 124(%r8), %ymm1 @@ -3742,9 +3743,12 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastsd 120(%rax), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm14[0],ymm7[2],ymm14[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,1],ymm0[0,2],ymm14[7,5],ymm0[4,6] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm8[0],ymm15[0],ymm8[1],ymm15[1],ymm8[4],ymm15[4],ymm8[5],ymm15[5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 108(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] @@ -3753,8 +3757,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm14[1,1],ymm7[5,5],ymm14[5,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm6[1,1],ymm2[5,5],ymm6[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm2[1,1],ymm3[5,5],ymm2[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,1],ymm8[1,1],ymm15[5,5],ymm8[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vbroadcastsd 112(%r8), %ymm1 @@ -3767,10 +3771,10 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -3781,14 +3785,12 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,3],ymm4[3,3],ymm3[7,7],ymm4[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,3],ymm12[3,3],ymm4[7,7],ymm12[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3],ymm5[3,3],ymm11[7,7],ymm5[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -3796,138 +3798,139 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6],ymm13[7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm13[2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm5[3,3],ymm14[3,3],ymm5[7,7],ymm14[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[3,3],ymm7[3,3],ymm12[7,7],ymm7[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[3,3],ymm11[3,3],ymm0[7,7],ymm11[7,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[2,3],ymm11[1,2],ymm0[6,7],ymm11[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm13[1,2,3,4],ymm11[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,3],ymm12[3,3],ymm11[7,7],ymm12[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3],ymm6[3,3],ymm10[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6],ymm8[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm13[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rax), %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm9[2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rax), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm6[3,3],ymm8[3,3],ymm6[7,7],ymm8[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,3],ymm14[3,3],ymm6[7,7],ymm14[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,3],ymm14[3,3],ymm4[7,7],ymm14[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[3,3],ymm0[3,3],ymm1[7,7],ymm0[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm2[3,3],mem[3,3],ymm2[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,3],ymm4[1,2],ymm2[6,7],ymm4[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm2[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rax), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rax), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm9[3,1],ymm5[0,2],ymm9[7,5],ymm5[4,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm2[0,2],ymm5[7,5],ymm2[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm10[0],ymm3[0],ymm10[1],ymm3[1],ymm10[4],ymm3[4],ymm10[5],ymm3[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[3,1],ymm3[0,2],ymm11[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm13[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm4[0],ymm14[0],ymm4[1],ymm14[1],ymm4[4],ymm14[4],ymm4[5],ymm14[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1],ymm5[0,2],ymm6[7,5],ymm5[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm9[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[4],ymm7[4],ymm12[5],ymm7[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[3,1],ymm7[0,2],ymm9[7,5],ymm7[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1],ymm9[0,2],ymm6[7,5],ymm9[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[3,3],xmm13[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm6, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 736(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 640(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 640(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 512(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) @@ -3963,7 +3966,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%rax) -; AVX1-ONLY-NEXT: addq $1656, %rsp # imm = 0x678 +; AVX1-ONLY-NEXT: addq $1624, %rsp # imm = 0x658 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3977,42 +3980,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm14 -; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm6 -; AVX2-SLOW-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm8 +; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm7 +; AVX2-SLOW-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm10 ; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm9 ; AVX2-SLOW-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm10 -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm11 +; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm12 +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm4 +; AVX2-SLOW-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm8[1],xmm12[1],zero +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm7 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm7[1],xmm10[1],zero -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm5 +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm13 ; AVX2-SLOW-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm4 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm12 -; AVX2-SLOW-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2],xmm1[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm13[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm6[1],xmm11[1],zero +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm9 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm4[1],zero ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4034,16 +4037,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm13 -; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm12 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm12[1],xmm13[1],zero +; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX2-SLOW-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm13 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm13[1],xmm2[1],zero ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%r9), %xmm11 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX2-SLOW-NEXT: vmovaps 96(%r9), %xmm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm7[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps 96(%rax), %xmm1 @@ -4058,7 +4062,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %xmm3 -; AVX2-SLOW-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %xmm2 ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero @@ -4088,11 +4092,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4139,10 +4143,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm9 -; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm4 ; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1],ymm1[1,1],ymm9[5,5],ymm1[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm4[1,1],ymm1[1,1],ymm4[5,5],ymm1[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vbroadcastsd 112(%r8), %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4,5,6],ymm15[7] @@ -4151,146 +4154,146 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastss 112(%rax), %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm0 -; AVX2-SLOW-NEXT: vbroadcastss %xmm7, %xmm15 +; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm15 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps %xmm14, %xmm9 +; AVX2-SLOW-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps %xmm14, %xmm11 ; AVX2-SLOW-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm12[2],xmm8[3],xmm12[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm10[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm0 -; AVX2-SLOW-NEXT: vbroadcastss %xmm6, %xmm4 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm14[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm11[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm12[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm0 -; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm4 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm5 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm8[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm11[3,3],xmm10[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vbroadcastsd 72(%rax), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm8[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vbroadcastsd 72(%rax), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm0 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm4 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm9[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm11[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vbroadcastsd 104(%rax), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vbroadcastsd 104(%rax), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 112(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6],ymm5[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 108(%r8), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm5 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = mem[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vmovaps 96(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 108(%r8), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,2,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = mem[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm6[1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vmovaps 96(%rax), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm6[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3,4],ymm0[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm7[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3,4],ymm0[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] @@ -4306,12 +4309,12 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm4[1,1],ymm9[5,5],ymm4[5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[1,1],ymm5[1,1],ymm10[5,5],ymm5[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4] @@ -4328,124 +4331,125 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1],ymm9[1,1],ymm0[5,5],ymm9[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-SLOW-NEXT: vbroadcastsd 48(%rax), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 48(%rax), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm2[3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,1],ymm15[1,1],ymm0[5,5],ymm15[5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,1],ymm15[1,1],ymm0[5,5],ymm15[5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 80(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm3[3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm3 -; AVX2-SLOW-NEXT: vmovaps %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm4[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6],ymm7[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm7 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[4],ymm8[4],ymm5[5],ymm8[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = xmm4[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm5[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] +; AVX2-SLOW-NEXT: vmovaps %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm7 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm7 = xmm3[3,3],mem[3,3] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0],ymm7[1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm11[3,3],ymm13[3,3],ymm11[7,7],ymm13[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4],ymm8[5,6],ymm9[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm3[1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm3 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6],ymm9[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm9 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = xmm2[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0],ymm9[1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm11 = ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[6],ymm12[6],ymm14[7],ymm12[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm4 = ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[6],ymm10[6],ymm5[7],ymm10[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[6],ymm1[6],ymm8[7],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm11[3,3],ymm13[3,3],ymm11[7,7],ymm13[7,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1,2],ymm10[3,4],ymm7[5,6],ymm10[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm9[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm7 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm7 = xmm3[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm7 = xmm7[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[6],ymm12[6],ymm14[7],ymm12[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3,4],ymm11[5,6],ymm12[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0],ymm3[1,2,3,4],ymm11[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 80(%rdx), %ymm11 -; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm7[1,2],ymm12[3,4],ymm7[5,6],ymm12[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 80(%rdx), %ymm7 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6],ymm12[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm12 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = xmm2[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm12 = xmm5[3,3],mem[3,3] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm13 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm13 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm13 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2],ymm14[3,4],ymm13[5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3,4],ymm13[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5,6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm12, 640(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 544(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 416(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 320(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 608(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 576(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 544(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 416(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 320(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm10, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 608(%rax) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 576(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm3, 384(%rax) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 160(%rax) @@ -4489,15 +4493,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i32_stride7_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1400, %rsp # imm = 0x578 +; AVX2-FAST-NEXT: subq $1416, %rsp # imm = 0x588 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps (%rax), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm2 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps (%r8), %xmm3 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm3 ; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps (%r8), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm4 ; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps (%r9), %xmm1 @@ -4505,30 +4509,29 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm5 ; AVX2-FAST-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm6 +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm6 ; AVX2-FAST-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1],xmm6[1],zero +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1],xmm2[1],zero ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm7 ; AVX2-FAST-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm13 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm11 ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm8 ; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2],xmm6[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2],xmm2[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 @@ -4538,8 +4541,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm2 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm2[1],xmm3[1],zero -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm6[1],zero +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%r8), %xmm1 @@ -4560,20 +4563,20 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm3 ; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm3[1],xmm2[1],zero -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%r9), %xmm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1,1,1] -; AVX2-FAST-NEXT: vmovaps %xmm2, %xmm12 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps %xmm2, %xmm7 +; AVX2-FAST-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps 96(%rax), %xmm1 @@ -4587,35 +4590,37 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm3 ; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm3[1],xmm2[1],zero -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm10 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[6],ymm10[6],ymm0[7],ymm10[7] -; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm13 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm9 -; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm8 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm6 -; AVX2-FAST-NEXT: vmovaps (%r9), %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4,5],ymm6[6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm6 +; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm10 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[6],ymm10[6],ymm6[7],ymm10[7] +; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm2 +; AVX2-FAST-NEXT: vmovaps (%r9), %ymm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm14 +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm8 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 @@ -4637,8 +4642,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps 48(%rax), %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 48(%rax), %xmm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm0 @@ -4647,11 +4652,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm2 +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 64(%r8), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4660,51 +4665,51 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps 80(%rax), %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 80(%rax), %xmm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm4 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm5 ; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm4[1,1],ymm1[1,1],ymm4[5,5],ymm1[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6],ymm15[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm5[1,1],ymm1[1,1],ymm5[5,5],ymm1[5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm15[5,6],ymm3[7] ; AVX2-FAST-NEXT: vbroadcastsd 112(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0],ymm3[1,2,3,4,5,6],ymm15[7] ; AVX2-FAST-NEXT: vbroadcastss 112(%r9), %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 112(%rax), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm15[2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm14 +; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm3 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6],ymm15[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vbroadcastss 108(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,2,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm7[2,2,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,6,5,6,5,6,5,6] -; AVX2-FAST-NEXT: vpermps 96(%r9), %ymm11, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm15[1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm12 = [5,6,5,6,5,6,5,6] +; AVX2-FAST-NEXT: vpermps 96(%r9), %ymm12, %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1,2,3,4,5,6],ymm12[7] ; AVX2-FAST-NEXT: vmovaps 96(%rax), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4],ymm0[5],ymm11[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -4715,44 +4720,45 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vbroadcastsd 120(%rax), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm14, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm12, %xmm1 +; AVX2-FAST-NEXT: vbroadcastss %xmm12, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [0,1,2,2,0,1,2,2] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vmovaps %xmm11, %xmm15 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm11 = [0,1,2,2,0,1,2,2] +; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vbroadcastsd (%rsp), %ymm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm15[3,3],xmm13[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm9[3,3],xmm15[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm11[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps %xmm5, %xmm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm8[1,1],ymm9[5,5],ymm8[5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1],ymm10[1,1],ymm6[5,5],ymm10[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm3 @@ -4767,11 +4773,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] @@ -4780,192 +4786,194 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm5[3,3] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1],ymm11[1,1],ymm3[5,5],ymm11[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vbroadcastsd 48(%rax), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm1[3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm0[1,1],mem[1,1],ymm0[5,5],mem[5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vbroadcastsd 48(%rax), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm3 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm9, %xmm4 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm1 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm2 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,3],xmm0[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm1[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vbroadcastsd 72(%rax), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm5[2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm0[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm6[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vbroadcastsd 72(%rax), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,1],ymm13[1,1],ymm0[5,5],ymm13[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-NEXT: vbroadcastsd 80(%rax), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1],ymm10[1,1],ymm13[5,5],ymm10[5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vbroadcastsd 80(%rax), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm4, %xmm6 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm3, %xmm8 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm7, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm3 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm8[4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm0[3,3],xmm2[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm9 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6],ymm0[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm5[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vbroadcastsd 104(%rax), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm6[2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm6 +; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3],xmm1[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vbroadcastsd 104(%rax), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm3[2,3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6],ymm9[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[4],ymm1[4],ymm10[5],ymm1[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm9 = xmm3[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, (%rsp), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0],ymm9[1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm10[2],ymm1[3],ymm10[3],ymm1[6],ymm10[6],ymm1[7],ymm10[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1,2],ymm10[3,4],ymm7[5,6],ymm10[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm7 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm11[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6],ymm10[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm4 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,3],xmm9[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm1[2],ymm7[3],ymm1[3],ymm7[6],ymm1[6],ymm7[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm10 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm10 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm10 = xmm3[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm10 = xmm10[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm12[2],ymm0[3],ymm12[3],ymm0[6],ymm12[6],ymm0[7],ymm12[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3,4],ymm4[5,6],ymm7[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm4 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3,4],ymm11[5,6],ymm12[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 80(%rdx), %ymm11 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm13[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm12 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm1[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm13[2],mem[2],ymm13[3],mem[3],ymm13[6],mem[6],ymm13[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm13 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm7 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[4],ymm1[4],ymm9[5],ymm1[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm7 = xmm5[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm7 = xmm7[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm8 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2],ymm14[3,4],ymm13[5,6],ymm14[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4],ymm8[5,6],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 80(%rdx), %ymm8 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6],ymm9[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm9 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[4],ymm12[4],ymm14[5],ymm12[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm9 = xmm0[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm9 = xmm9[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm10 = ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[6],ymm14[6],ymm12[7],ymm14[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm15[3,3],mem[3,3],ymm15[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4],ymm10[5,6],ymm11[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm12, 640(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm11, 544(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm10, 416(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm7, 320(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm6, 192(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm9, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm5, 736(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm8, 672(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm9, 640(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm8, 544(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm7, 416(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm4, 320(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm3, 192(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm6, 736(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 672(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 608(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5006,7 +5014,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 800(%rax) -; AVX2-FAST-NEXT: addq $1400, %rsp # imm = 0x578 +; AVX2-FAST-NEXT: addq $1416, %rsp # imm = 0x588 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -5020,42 +5028,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm8[1],xmm12[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm7[1],xmm10[1],zero -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm13[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm6[1],xmm11[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm4[1],zero ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5077,16 +5085,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm12[1],xmm13[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm13[1],xmm2[1],zero ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r9), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r9), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm7[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rax), %xmm1 @@ -5101,7 +5110,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero @@ -5131,11 +5140,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5182,10 +5191,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1],ymm1[1,1],ymm9[5,5],ymm1[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm4[1,1],ymm1[1,1],ymm4[5,5],ymm1[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 112(%r8), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4,5,6],ymm15[7] @@ -5194,146 +5202,146 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%rax), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm7, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm14, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm14, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm12[2],xmm8[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm10[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm6, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm14[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm11[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm12[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm8[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm11[3,3],xmm10[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm5 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 72(%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm8[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 72(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm9[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm5 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm11[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 104(%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 104(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 108(%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm5 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = mem[1,2,2,3,5,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 108(%r8), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = mem[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm6[1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3,4],ymm0[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm7[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3,4],ymm0[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] @@ -5349,12 +5357,12 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm4[1,1],ymm9[5,5],ymm4[5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[1,1],ymm5[1,1],ymm10[5,5],ymm5[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4] @@ -5371,124 +5379,125 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1],ymm9[1,1],ymm0[5,5],ymm9[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%rax), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%rax), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm2[3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,1],ymm15[1,1],ymm0[5,5],ymm15[5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,1],ymm15[1,1],ymm0[5,5],ymm15[5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 80(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm3[3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm4[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm7 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[4],ymm8[4],ymm5[5],ymm8[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm4[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm5[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm7 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm3[3,3],mem[3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0],ymm7[1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm11[3,3],ymm13[3,3],ymm11[7,7],ymm13[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4],ymm8[5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm3[1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm9 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm2[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0],ymm9[1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm11 = ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[6],ymm12[6],ymm14[7],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm4 = ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[6],ymm10[6],ymm5[7],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[6],ymm1[6],ymm8[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm11[3,3],ymm13[3,3],ymm11[7,7],ymm13[7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1,2],ymm10[3,4],ymm7[5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm9[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm7 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm3[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm7[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[6],ymm12[6],ymm14[7],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3,4],ymm11[5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0],ymm3[1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm7[1,2],ymm12[3,4],ymm7[5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm12 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm2[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm5[3,3],mem[3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm13 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm13 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2],ymm14[3,4],ymm13[5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3,4],ymm13[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5,6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 640(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 544(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 416(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 608(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 576(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 544(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 608(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 576(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 384(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 160(%rax) @@ -5534,200 +5543,201 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F: # %bb.0: ; AVX512F-NEXT: pushq %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm14 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm14 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm10 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm12 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm21 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm12 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm28 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512F-NEXT: vmovdqa64 (%rax), %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm30, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm7, %zmm4 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm23 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm9 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm13 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm22 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512F-NEXT: vmovdqa64 (%rax), %zmm15 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512F-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vpermi2d %zmm28, %zmm4, %zmm0 +; AVX512F-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm22, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm13, %zmm4 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> -; AVX512F-NEXT: vpermi2d %zmm28, %zmm4, %zmm0 +; AVX512F-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm2, %zmm16 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512F-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm27, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512F-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm2, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermt2d %zmm9, %zmm2, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512F-NEXT: movw $14448, %cx # imm = 0x3870 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm27, %zmm28 -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm27 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm2, %zmm28 -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm18 {%k2} -; AVX512F-NEXT: vpermi2d %zmm12, %zmm1, %zmm30 -; AVX512F-NEXT: vpermi2d %zmm14, %zmm15, %zmm7 -; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512F-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm25 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm18 {%k2} +; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 +; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> -; AVX512F-NEXT: vpermi2d %zmm3, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> -; AVX512F-NEXT: vpermi2d %zmm9, %zmm2, %zmm28 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> +; AVX512F-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512F-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermt2d %zmm11, %zmm2, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512F-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm16 {%k2} -; AVX512F-NEXT: vpermi2d %zmm14, %zmm15, %zmm22 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm12, %zmm13 -; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512F-NEXT: vpermi2d %zmm9, %zmm22, %zmm25 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512F-NEXT: vpermt2d %zmm20, %zmm22, %zmm31 +; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm16 {%k2} +; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512F-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512F-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512F-NEXT: vpermt2d %zmm27, %zmm25, %zmm31 +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512F-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512F-NEXT: vpermt2d %zmm11, %zmm28, %zmm24 -; AVX512F-NEXT: vpermi2d %zmm12, %zmm1, %zmm2 -; AVX512F-NEXT: vpermi2d %zmm15, %zmm14, %zmm17 +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512F-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 +; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 +; AVX512F-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermt2d %zmm10, %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512F-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm4 {%k2} -; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm25, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = -; AVX512F-NEXT: vpermt2d %zmm20, %zmm24, %zmm19 -; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> -; AVX512F-NEXT: vpermt2d %zmm27, %zmm22, %zmm19 +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} +; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512F-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> +; AVX512F-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512F-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm4 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512F-NEXT: vpermt2d %zmm11, %zmm19, %zmm29 -; AVX512F-NEXT: vpermi2d %zmm12, %zmm1, %zmm28 -; AVX512F-NEXT: vpermi2d %zmm14, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm2 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512F-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 +; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 +; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512F-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm26 {%k1} -; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm22, %zmm24 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512F-NEXT: vpermt2d %zmm20, %zmm22, %zmm21 -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm2 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> -; AVX512F-NEXT: vpermt2d %zmm27, %zmm24, %zmm21 +; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} +; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512F-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> +; AVX512F-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512F-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm21 {%k2} -; AVX512F-NEXT: vpermi2d %zmm12, %zmm1, %zmm19 -; AVX512F-NEXT: vpermi2d %zmm14, %zmm15, %zmm25 -; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm25 {%k1} -; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm24, %zmm22 -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm22 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512F-NEXT: vpermt2d %zmm10, %zmm19, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm19, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> -; AVX512F-NEXT: vpermt2d %zmm11, %zmm10, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm10, %zmm1 -; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-NEXT: vpermt2d %zmm8, %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-NEXT: vpermt2d %zmm9, %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm23 {%k2} +; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm0 +; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm21 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} +; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} ; AVX512F-NEXT: movw $3612, %ax # imm = 0xE1C ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm20, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm20, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] -; AVX512F-NEXT: vpermi2d %zmm27, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512F-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] -; AVX512F-NEXT: vpermi2d %zmm27, %zmm3, %zmm0 +; AVX512F-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm22, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 384(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm19, 512(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm23, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 640(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512F-NEXT: vmovdqa64 %zmm0, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 832(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 832(%rax) ; AVX512F-NEXT: popq %rax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -5736,200 +5746,201 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: pushq %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm12 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm21 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm28 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm30, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm23 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm9 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm22 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm15 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512BW-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm2, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512BW-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm27, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm2, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm27, %zmm28 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm2, %zmm28 -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm18 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm1, %zmm30 -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm15, %zmm7 -; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm18 {%k2} +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm2, %zmm28 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512BW-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm2, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512BW-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm15, %zmm22 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm22, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm22, %zmm31 +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm16 {%k2} +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512BW-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm25, %zmm31 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512BW-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm28, %zmm24 -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm1, %zmm2 -; AVX512BW-NEXT: vpermi2d %zmm15, %zmm14, %zmm17 +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512BW-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm4 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm24, %zmm19 -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm22, %zmm19 +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512BW-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm4 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm19, %zmm29 -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm1, %zmm28 -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512BW-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm26 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm22, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm22, %zmm21 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm2 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm24, %zmm21 +; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512BW-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm21 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm1, %zmm19 -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm15, %zmm25 -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm25 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm24, %zmm22 -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm19, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm10, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm23 {%k2} +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm21 +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} ; AVX512BW-NEXT: movw $3612, %ax # imm = 0xE1C ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm20, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm20, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] -; AVX512BW-NEXT: vpermi2d %zmm27, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] -; AVX512BW-NEXT: vpermi2d %zmm27, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 640(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 832(%rax) ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -5960,69 +5971,69 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa (%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm5 -; SSE-NEXT: movdqa 16(%rsi), %xmm4 -; SSE-NEXT: movaps (%rdx), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: movaps (%rdx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdx), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm9 -; SSE-NEXT: movaps 16(%rcx), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rcx), %xmm13 +; SSE-NEXT: movaps 16(%rcx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%r8), %xmm8 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm12 +; SSE-NEXT: movaps 16(%r8), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r9), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r9), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rax), %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rax), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 32(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%r9), %xmm1 @@ -6032,8 +6043,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa 32(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6046,8 +6057,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 48(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%r8), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps 48(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6058,8 +6069,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6073,8 +6084,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 64(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%r8), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps 64(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6085,8 +6096,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa 64(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6111,8 +6122,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 80(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6124,11 +6135,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 96(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rcx), %xmm10 ; SSE-NEXT: movaps 96(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%r9), %xmm1 @@ -6138,8 +6148,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 96(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6164,8 +6174,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 112(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6180,7 +6190,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps 128(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6309,21 +6319,22 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 208(%rcx), %xmm8 +; SSE-NEXT: movaps 208(%rcx), %xmm6 ; SSE-NEXT: movaps 208(%r8), %xmm4 ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[1,1] -; SSE-NEXT: movaps %xmm8, %xmm11 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[1,1] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%r9), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rax), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa 208(%r9), %xmm6 +; SSE-NEXT: movdqa 208(%rax), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6331,10 +6342,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm12[3,3] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 224(%rsi), %xmm0 @@ -6343,22 +6354,22 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps 224(%rcx), %xmm3 +; SSE-NEXT: movaps 224(%rcx), %xmm4 ; SSE-NEXT: movaps 224(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps 224(%r9), %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] +; SSE-NEXT: movaps 224(%r9), %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movaps 224(%rax), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6372,21 +6383,21 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 240(%rcx), %xmm8 +; SSE-NEXT: movaps 240(%rcx), %xmm6 ; SSE-NEXT: movaps 240(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm6[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps 240(%r9), %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] +; SSE-NEXT: movaps 240(%r9), %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movaps 240(%rax), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6395,9 +6406,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, %xmm3 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -6407,139 +6418,187 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm4[1,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[0,2] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm5 ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[1,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movaps %xmm4, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm14[1,3] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[1,3] +; SSE-NEXT: movaps (%rsp), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[1,1],mem[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -6553,21 +6612,21 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movaps %xmm1, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload @@ -6578,17 +6637,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movaps %xmm1, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm2[2],xmm8[3],xmm2[3] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -6601,77 +6660,28 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] @@ -6681,8 +6691,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] @@ -6700,13 +6710,13 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3] ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -6715,178 +6725,197 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm14[3,3] ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movaps %xmm1, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm10 +; SSE-NEXT: movaps %xmm3, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 224(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps 224(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm11 -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, %xmm9 +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: movaps 240(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0] -; SSE-NEXT: movaps %xmm3, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[0,3] +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,0] +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm1[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[2,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm11[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[3,3],mem[3,3] +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm12[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm4[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] @@ -6932,30 +6961,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] @@ -6984,19 +6995,19 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm0[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm12 = xmm4[0],xmm12[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm0[2,0] +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm4[0],xmm11[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm4[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm4[0],xmm8[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3] ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7007,23 +7018,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm3, 1760(%rax) -; SSE-NEXT: movaps %xmm11, 1744(%rax) +; SSE-NEXT: movaps %xmm7, 1744(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1728(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1696(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1696(%rax) ; SSE-NEXT: movaps %xmm5, 1680(%rax) -; SSE-NEXT: movaps %xmm6, 1648(%rax) -; SSE-NEXT: movaps %xmm7, 1632(%rax) +; SSE-NEXT: movaps %xmm8, 1648(%rax) +; SSE-NEXT: movaps %xmm9, 1632(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1616(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1584(%rax) -; SSE-NEXT: movaps %xmm9, 1568(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1584(%rax) +; SSE-NEXT: movaps %xmm10, 1568(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1536(%rax) ; SSE-NEXT: movaps %xmm13, 1520(%rax) -; SSE-NEXT: movaps %xmm10, 1472(%rax) +; SSE-NEXT: movaps %xmm12, 1472(%rax) ; SSE-NEXT: movaps %xmm14, 1456(%rax) ; SSE-NEXT: movaps %xmm15, 1408(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7042,101 +7053,101 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 1136(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1120(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1072(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1024(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1008(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 960(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 912(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 896(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 848(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 800(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 784(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 736(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 688(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 672(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 624(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 576(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 560(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 512(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 464(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 448(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 400(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 352(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 336(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 288(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 240(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1776(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1712(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1664(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1600(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1552(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1504(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1488(%rax) -; SSE-NEXT: movaps %xmm4, 1440(%rax) -; SSE-NEXT: movaps %xmm8, 1424(%rax) +; SSE-NEXT: movaps %xmm1, 1072(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1392(%rax) +; SSE-NEXT: movaps %xmm1, 1024(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1376(%rax) -; SSE-NEXT: movaps %xmm12, 1328(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1312(%rax) +; SSE-NEXT: movaps %xmm1, 1008(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 960(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 912(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 896(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 848(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 800(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 784(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 736(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 688(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 672(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 624(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 576(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 560(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 512(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 464(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 448(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 400(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 352(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 336(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 288(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1280(%rax) +; SSE-NEXT: movaps %xmm1, 240(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1264(%rax) +; SSE-NEXT: movaps %xmm1, 224(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 176(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 128(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1776(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1712(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1664(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1600(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1552(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1504(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1488(%rax) +; SSE-NEXT: movaps %xmm4, 1440(%rax) +; SSE-NEXT: movaps %xmm6, 1424(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1392(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1376(%rax) +; SSE-NEXT: movaps %xmm11, 1328(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1312(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1280(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1264(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1216(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1200(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1168(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1152(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1168(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1152(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1104(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7145,7 +7156,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 1056(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1040(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 992(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 976(%rax) @@ -7161,7 +7172,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 832(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 816(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 768(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 752(%rax) @@ -7179,7 +7190,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 592(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 544(%rax) -; SSE-NEXT: movaps %xmm2, 528(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 528(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 496(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7194,8 +7206,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 368(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 304(%rax) +; SSE-NEXT: movaps %xmm2, 304(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7221,40 +7232,41 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3416, %rsp # imm = 0xD58 +; AVX1-ONLY-NEXT: subq $3432, %rsp # imm = 0xD68 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rax), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rax), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7266,18 +7278,18 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm5[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm4[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm7[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] @@ -7289,17 +7301,16 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm12 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm1[1,1],ymm12[5,5],ymm1[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7309,26 +7320,26 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm10 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm12 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7336,28 +7347,28 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm6[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm6[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm8[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm12[1],xmm10[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm8[1,1],ymm1[5,5],ymm8[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm10[1,1],ymm1[5,5],ymm10[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7376,16 +7387,15 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 @@ -7404,10 +7414,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm5[1],xmm9[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7417,16 +7427,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7436,26 +7447,26 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7463,12 +7474,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm6[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm7[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] @@ -7488,8 +7499,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm2[0],ymm14[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7508,7 +7519,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7516,9 +7527,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7526,14 +7537,14 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm7[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm6[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm6[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm7[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7543,11 +7554,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm1[1,1],ymm9[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7563,26 +7574,24 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm11 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7590,34 +7599,34 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm7[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm5[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm6[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm6[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm3[1],xmm7[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm15[1,1],ymm0[5,5],ymm15[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1],ymm1[1,1],ymm6[5,5],ymm1[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm11 ; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm9[0],ymm13[2],ymm9[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm9[2,1],ymm1[6,4],ymm9[6,5] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm11[2,1],ymm1[6,4],ymm11[6,5] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] @@ -7626,24 +7635,24 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm12 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7651,193 +7660,194 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm7[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm5[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm11[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm3[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm4[1],xmm5[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm6[1],xmm7[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm4[1,1],ymm5[5,5],ymm4[5,5] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm0[1,1],ymm7[5,5],ymm0[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1],ymm14[1,1],ymm7[5,5],ymm14[5,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm11[2,1],ymm2[6,4],ymm11[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm2[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0],ymm1[2,1],ymm12[6,4],ymm1[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm8[1],ymm4[3],ymm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[1,1],ymm4[0,2],ymm8[5,5],ymm4[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,1],ymm4[0,2],ymm3[5,5],ymm4[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm15[1],ymm1[3],ymm15[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,1],ymm1[0,2],ymm15[5,5],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1],ymm1[0,2],ymm10[5,5],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 112(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,1],ymm4[0,2],ymm3[5,5],ymm4[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 144(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm14[1],ymm4[3],ymm14[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm14[1,1],ymm4[0,2],ymm14[5,5],ymm4[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 112(%rax), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[6],ymm2[6],ymm6[7],ymm2[7] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm10 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm13[1],ymm9[3],ymm13[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1],ymm1[0,2],ymm13[5,5],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 176(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,1],ymm4[0,2],ymm6[5,5],ymm4[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 144(%rax), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1],ymm4[0,2],ymm13[5,5],ymm4[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 176(%rax), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm14[2],ymm7[3],ymm14[3],ymm7[6],ymm14[6],ymm7[7],ymm14[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm11[1],ymm3[1],ymm11[3],ymm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm1[0,2],ymm3[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 208(%rax), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm2[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1],xmm0[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[1],xmm4[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm0[1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 228(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 228(%r9), %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm12[1],xmm4[1],zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 228(%r8), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 228(%r9), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rax), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm1[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm12[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[3,3],xmm2[3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3],xmm3[3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 232(%r9), %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 232(%rax), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 232(%r9), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 232(%rax), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,3],ymm7[3,3],ymm14[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,3],ymm6[3,3],ymm8[7,7],ymm6[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,3],ymm5[3,3],ymm6[7,7],ymm5[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 220(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 220(%r9), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 216(%rax), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3],ymm7[3,3],ymm11[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 220(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 220(%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 216(%rax), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm11[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[2,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] @@ -7849,12 +7859,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,3],ymm3[3,3],ymm4[7,7],ymm3[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,3],ymm3[3,3],ymm8[7,7],ymm3[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3],ymm2[3,3],ymm6[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,3],ymm2[3,3],ymm4[7,7],ymm2[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 252(%r8), %ymm1 @@ -7864,9 +7874,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,1],ymm0[0,2],ymm4[7,5],ymm0[4,6] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm8[0],ymm3[2],ymm8[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,1],ymm0[0,2],ymm8[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 236(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] @@ -7875,8 +7885,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm4[1,1],ymm3[5,5],ymm4[5,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm6[1,1],ymm2[5,5],ymm6[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm8[1,1],ymm3[5,5],ymm8[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm4[1,1],ymm2[5,5],ymm4[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vbroadcastsd 240(%r8), %ymm1 @@ -7889,10 +7899,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7903,13 +7913,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm5[3,3],mem[3,3],ymm5[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,3],ymm7[3,3],ymm1[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,3],ymm6[3,3],ymm1[7,7],ymm6[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7942,9 +7951,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm4[3,3],mem[3,3],ymm4[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7974,11 +7983,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm8[3,3],ymm0[7,7],ymm8[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,3],ymm6[3,3],ymm1[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8008,11 +8018,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm12[3,3],ymm0[7,7],ymm12[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm10[3,3],ymm0[7,7],ymm10[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm4 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,3],ymm15[3,3],ymm1[7,7],ymm15[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8041,170 +8051,170 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm14[3,3],mem[3,3],ymm14[7,7],mem[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm12[3,3],ymm1[3,3],ymm12[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,3],ymm15[3,3],ymm12[7,7],ymm15[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,3],mem[3,3],ymm2[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm11[3,3],ymm2[7,7],ymm11[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3],ymm2[1,2],ymm3[6,7],ymm2[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3],ymm1[1,2],ymm3[6,7],ymm1[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vbroadcastsd 168(%rax), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[3,3],ymm11[3,3],ymm10[7,7],ymm11[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vbroadcastsd 168(%rax), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3],ymm9[3,3],ymm10[7,7],ymm9[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3],ymm10[3,3],ymm9[7,7],ymm10[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,3],mem[3,3],ymm2[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,3],ymm9[3,3],ymm0[7,7],ymm9[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3],ymm2[1,2],ymm3[6,7],ymm2[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm3[3,3],mem[3,3],ymm3[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,3],ymm3[1,2],ymm5[6,7],ymm3[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vbroadcastsd 200(%rax), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[4],ymm7[4],ymm0[5],ymm7[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,1],ymm2[0,2],ymm7[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,1],ymm2[0,2],ymm7[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vbroadcastsd 200(%rax), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[4],ymm6[4],ymm1[5],ymm6[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[3,1],ymm2[0,2],ymm6[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,1],ymm3[0,2],ymm6[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[4],mem[4],ymm4[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm2[0,2],ymm4[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[4],ymm1[4],ymm12[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm4[0],ymm14[0],ymm4[2],ymm14[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm14[3,1],ymm12[0,2],ymm14[7,5],ymm12[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm1[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm2[0],ymm11[0],ymm2[1],ymm11[1],ymm2[4],ymm11[4],ymm2[5],ymm11[5] +; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[3,1],ymm12[0,2],ymm2[7,5],ymm12[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm2[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[3,1],ymm14[0,2],ymm1[7,5],ymm14[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm13[3,3],xmm15[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm0[0],ymm9[0],ymm0[1],ymm9[1],ymm0[4],ymm9[4],ymm0[5],ymm9[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm10[3,1],ymm15[0,2],ymm10[7,5],ymm15[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm13[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[3,1],ymm14[0,2],ymm1[7,5],ymm14[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm5[3,3],xmm8[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[3,1],ymm13[0,2],ymm0[7,5],ymm13[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm7[3,3],xmm8[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm9, 1440(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 1216(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 992(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 320(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 1440(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 1216(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 992(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1504(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8303,42 +8313,41 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1632(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1600(%rax) -; AVX1-ONLY-NEXT: addq $3416, %rsp # imm = 0xD58 +; AVX1-ONLY-NEXT: addq $3432, %rsp # imm = 0xD68 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $3000, %rsp # imm = 0xBB8 +; AVX2-SLOW-NEXT: subq $2968, %rsp # imm = 0xB98 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm2 -; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm13 ; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm4 ; AVX2-SLOW-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm7 -; AVX2-SLOW-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm5 +; AVX2-SLOW-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm9 +; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm10 ; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm8 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm8[1],xmm9[1],zero -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm5 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm10[1],zero +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm6 ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-SLOW-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] @@ -8346,15 +8355,15 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2],xmm1[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm7[1],xmm3[1],zero +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm3[1],zero ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8376,10 +8385,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm3 +; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm2 ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm11 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm11[1],xmm2[1],zero +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8410,9 +8420,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%r8), %xmm1 -; AVX2-SLOW-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%r9), %xmm0 -; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 @@ -8510,11 +8520,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8625,11 +8635,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 192(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8652,319 +8662,317 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastss %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vbroadcastss %xmm1, %xmm3 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %xmm4 -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 224(%r8), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 224(%r8), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovaps 224(%r9), %xmm3 -; AVX2-SLOW-NEXT: vbroadcastss %xmm3, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vbroadcastss %xmm3, %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 224(%rax), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2],xmm14[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6],ymm2[7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm1[1],xmm0[1],zero -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 228(%r8), %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 228(%r8), %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm14 = xmm3[1,1,1,1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 224(%rax), %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm5[3,3],xmm4[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovaps 224(%r8), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 224(%r8), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm10 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm12 ; AVX2-SLOW-NEXT: vmovaps 224(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm0[1,1],ymm2[1,1],ymm0[5,5],ymm2[5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vbroadcastsd 240(%r8), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vbroadcastss 240(%r9), %xmm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 240(%rax), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,1],ymm2[1,1],ymm12[5,5],ymm2[5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6],ymm14[7] +; AVX2-SLOW-NEXT: vbroadcastsd 240(%r8), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vbroadcastss 240(%r9), %xmm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 240(%rax), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm13 -; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm14 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm14 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm15 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3],xmm5[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm4[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vmovaps %xmm13, %xmm1 +; AVX2-SLOW-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6],ymm14[7] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm6[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm1, %xmm5 -; AVX2-SLOW-NEXT: vbroadcastss %xmm7, %xmm6 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm1, %xmm6 +; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm14[0],xmm4[1],xmm14[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm13[0],xmm9[1],xmm13[1] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm14[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm13[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm5 -; AVX2-SLOW-NEXT: vbroadcastss %xmm11, %xmm6 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm14[3,3],xmm15[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vbroadcastsd 72(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm15, %xmm6 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm7 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm6 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm1[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vbroadcastsd 104(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm15[2],xmm10[3],xmm15[3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vbroadcastsd 72(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm15, %xmm6 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm14, %xmm7 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm6 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vbroadcastsd 104(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm15, %xmm6 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm14, %xmm7 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm4[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vbroadcastsd 136(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vbroadcastsd 136(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm14, %xmm6 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm7 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm6 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm4[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vbroadcastsd 168(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vbroadcastsd 168(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm14, %xmm6 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm7 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm5 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm11, %xmm6 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm4[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vbroadcastsd 200(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vbroadcastsd 200(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 220(%r8), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 220(%r9), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 216(%rax), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 220(%r8), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 220(%r9), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 216(%rax), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss 240(%rdx), %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm2[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[4],ymm10[4],ymm12[5],ymm10[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 236(%r8), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 240(%rdx), %ymm6 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[1],ymm4[1],ymm11[4],ymm4[4],ymm11[5],ymm4[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 236(%r8), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm5 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = mem[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vmovaps 224(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm7[3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm4[2],ymm11[3],ymm4[3],ymm11[6],ymm4[6],ymm11[7],ymm4[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[6],ymm2[6],ymm12[7],ymm2[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = mem[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovaps 224(%rax), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[2,3],ymm6[2,3] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm7[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[6],ymm11[6],ymm4[7],ymm11[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -8975,33 +8983,33 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm6[1,1],ymm12[5,5],ymm6[5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm7[1,1],ymm1[5,5],ymm7[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -9018,9 +9026,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm15[1,1],ymm1[5,5],ymm15[5,5] +; AVX2-SLOW-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -9034,8 +9042,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -9090,9 +9098,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm4[1,1],ymm0[5,5],ymm4[5,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -9103,44 +9111,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 208(%rax), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm12[2],ymm6[3],ymm12[3],ymm6[6],ymm12[6],ymm6[7],ymm12[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,3],ymm12[3,3],ymm9[7,7],ymm12[7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3],ymm14[3,3],ymm13[7,7],ymm14[7,7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vmovaps %ymm11, %ymm6 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[4],ymm14[4],ymm10[5],ymm14[5] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm15[0],ymm4[1],ymm15[1],ymm4[4],ymm15[4],ymm4[5],ymm15[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm10[2],ymm14[3],ymm10[3],ymm14[6],ymm10[6],ymm14[7],ymm10[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[6],ymm4[6],ymm15[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -9150,25 +9157,27 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 80(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm13 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm13[2],mem[2],ymm13[3],mem[3],ymm13[6],mem[6],ymm13[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -9177,14 +9186,13 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 112(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -9194,103 +9202,104 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 144(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm1[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 144(%rdx), %ymm0 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = xmm5[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[4],ymm2[4],ymm15[5],ymm2[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps $255, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[6],ymm15[6],ymm2[7],ymm15[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2],ymm7[3,4],ymm8[5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm0[1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 176(%rdx), %ymm7 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 176(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm10[0],ymm1[1],ymm10[1],ymm1[4],ymm10[4],ymm1[5],ymm10[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = xmm5[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[6],ymm1[6],ymm10[7],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm0[6],ymm15[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm15 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm15 = xmm7[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm15 = xmm15[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0],ymm8[1,2],ymm10[3,4],ymm8[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] -; AVX2-SLOW-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vbroadcastss 208(%rdx), %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4],ymm15[5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX2-SLOW-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vbroadcastss 208(%rdx), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = xmm1[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = xmm10[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm10[1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = xmm1[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5,6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm8, 1440(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm13, 1440(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1312(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm7, 1216(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 1088(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 992(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 864(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 1216(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 1088(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 992(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 864(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm6, 768(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm13, 640(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm15, 544(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm14, 416(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm12, 320(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 640(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 544(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 1504(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm11, 320(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm12, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm10, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 1472(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 1504(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 1472(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9373,13 +9382,13 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1600(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1568(%rax) -; AVX2-SLOW-NEXT: addq $3000, %rsp # imm = 0xBB8 +; AVX2-SLOW-NEXT: addq $2968, %rsp # imm = 0xB98 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i32_stride7_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $3096, %rsp # imm = 0xC18 +; AVX2-FAST-NEXT: subq $3080, %rsp # imm = 0xC08 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9569,28 +9578,28 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[6],ymm1[6],ymm11[7],ymm1[7] -; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm2 -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%r9), %ymm12 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm15 +; AVX2-FAST-NEXT: vmovaps (%r9), %ymm13 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3,4,5],ymm15[6],ymm1[7] +; AVX2-FAST-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9609,11 +9618,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9632,11 +9641,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9655,11 +9664,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 128(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9678,11 +9687,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9702,10 +9711,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm9 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] -; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm13 +; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm12 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] +; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm10 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm7 @@ -9737,31 +9746,31 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vbroadcastss 228(%r8), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 224(%r9), %xmm4 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm4[1,1,1,1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vinsertf128 $1, 224(%rax), %ymm5, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss %xmm3, %xmm2 ; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm5 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,1,2,2,0,1,2,2] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm10, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 224(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vbroadcastss %xmm4, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7] -; AVX2-FAST-NEXT: vbroadcastss 224(%rax), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6],ymm2[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,1,2,2,0,1,2,2] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm9, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 224(%r8), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vbroadcastss %xmm4, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5],ymm2[6,7] +; AVX2-FAST-NEXT: vbroadcastss 224(%rax), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovaps 224(%r8), %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] @@ -9774,51 +9783,51 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm1 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovaps 224(%rcx), %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm3[1,1],ymm0[1,1],ymm3[5,5],ymm0[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6],ymm15[7] -; AVX2-FAST-NEXT: vbroadcastsd 240(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vbroadcastss 240(%r9), %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 240(%rax), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6],ymm11[7] +; AVX2-FAST-NEXT: vbroadcastsd 240(%r8), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vbroadcastss 240(%r9), %xmm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 240(%rax), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm9[2],ymm13[2],ymm9[3],ymm13[3],ymm9[6],ymm13[6],ymm9[7],ymm13[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 220(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vbroadcastss 220(%r9), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vbroadcastsd 216(%rax), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss 240(%rdx), %ymm14 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vbroadcastss 236(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 220(%r8), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 220(%r9), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vbroadcastsd 216(%rax), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0],ymm11[1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastss 240(%rdx), %ymm11 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5],ymm11[6],ymm14[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 236(%r8), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm14 = [5,6,5,6,5,6,5,6] -; AVX2-FAST-NEXT: vpermps 224(%r9), %ymm14, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0],ymm6[1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vmovaps 224(%rax), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm14[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,6,5,6,5,6,5,6] +; AVX2-FAST-NEXT: vpermps 224(%r9), %ymm11, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vmovaps 224(%rax), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm14[2,3],ymm15[2,3] +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm11[2,3],ymm14[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] @@ -9838,25 +9847,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vbroadcastss %xmm4, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm6[3,3] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -9867,43 +9876,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm11[1,1],mem[1,1],ymm11[5,5],mem[5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm4, %xmm0 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm1 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm3, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm6[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] @@ -9927,37 +9936,37 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm3[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 72(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] @@ -9975,43 +9984,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm1 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm3[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 104(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] +; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -10023,39 +10032,39 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm1 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm3[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 136(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -10071,31 +10080,31 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm1 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm3[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 168(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] @@ -10105,9 +10114,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm14[1,1],ymm1[5,5],ymm14[5,5] +; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -10119,42 +10128,42 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm0 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm4, %xmm1 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 200(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm13[1,1],ymm0[5,5],ymm13[5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm12[1,1],ymm0[5,5],ymm12[5,5] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -10168,12 +10177,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -10182,178 +10191,175 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: # xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3,4],ymm1[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm3[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm6[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm6[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3,4],ymm1[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm0[1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 80(%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm3 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[4],ymm15[4],ymm2[5],ymm15[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3,4],ymm1[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm11[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6],ymm5[7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm4[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm5[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3,4],ymm11[5,6],ymm12[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm5[1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 144(%rdx), %ymm5 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm2[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm5[6],ymm12[7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm12 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[4],ymm1[4],ymm9[5],ymm1[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm3[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0],ymm12[1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm3[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm6[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm6[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0],ymm5[1,2],ymm9[3,4],ymm5[5,6],ymm9[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 176(%rdx), %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm14[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6],ymm9[7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3,4],ymm6[5,6],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm1[6],ymm9[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm9 = xmm8[3,3],mem[3,3] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm9[1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm14[2],mem[2],ymm14[3],mem[3],ymm14[6],mem[6],ymm14[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm10 = ymm13[2],ymm10[2],ymm13[3],ymm10[3],ymm13[6],ymm10[6],ymm13[7],ymm10[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1,2],ymm13[3,4],ymm10[5,6],ymm13[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 144(%rdx), %ymm10 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5],ymm10[6],ymm13[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm13 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm13 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm13 = xmm8[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm13 = xmm13[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 176(%rdx), %ymm14 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[4],ymm0[4],ymm4[5],ymm0[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm15 = xmm8[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm15 = xmm15[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2],ymm8[3,4],ymm9[5,6],ymm8[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm15[1,2],ymm8[3,4],ymm15[5,6],ymm8[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm13[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vbroadcastss 208(%rdx), %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6],ymm9[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm12[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vbroadcastss 208(%rdx), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5],ymm4[6],ymm15[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm8 = xmm1[3,3],mem[3,3] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm7, 1440(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm4, 1440(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm0, 1312(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm5, 1216(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm3, 1088(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm12, 992(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm11, 864(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm4, 768(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm10, 640(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm15, 544(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm14, 1216(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm13, 1088(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm10, 992(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm9, 864(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm2, 768(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm6, 640(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm3, 544(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm7, 416(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm5, 320(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10444,42 +10450,41 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 1568(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 1600(%rax) -; AVX2-FAST-NEXT: addq $3096, %rsp # imm = 0xC18 +; AVX2-FAST-NEXT: addq $3080, %rsp # imm = 0xC08 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $3000, %rsp # imm = 0xBB8 +; AVX2-FAST-PERLANE-NEXT: subq $2968, %rsp # imm = 0xB98 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm8[1],xmm9[1],zero -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm10[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] @@ -10487,15 +10492,15 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm7[1],xmm3[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm3[1],zero ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10517,10 +10522,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm11[1],xmm2[1],zero +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10551,9 +10557,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%r8), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%r9), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 @@ -10651,11 +10657,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10766,11 +10772,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10793,319 +10799,317 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 224(%r8), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 224(%r8), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%r9), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm3, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm3, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 224(%rax), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm1[1],xmm0[1],zero -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 228(%r8), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 228(%r8), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm14 = xmm3[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 224(%rax), %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm5[3,3],xmm4[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm0[1,1],ymm2[1,1],ymm0[5,5],ymm2[5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 240(%r8), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%r9), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%rax), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,1],ymm2[1,1],ymm12[5,5],ymm2[5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 240(%r8), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%r9), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%rax), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm14 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3],xmm5[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm4[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm6[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm1, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm14[0],xmm4[1],xmm14[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm13[0],xmm9[1],xmm13[1] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm14[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm13[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm11, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm14[3,3],xmm15[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 72(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm15, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm1[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 104(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm15[2],xmm10[3],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 72(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm15, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm14, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 104(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm15, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm14, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm4[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 136(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 136(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm14, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm4[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 168(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 168(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm14, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm11, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm4[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 200(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 200(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 220(%r8), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 220(%r9), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 216(%rax), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 220(%r8), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 220(%r9), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 216(%rax), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%rdx), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm2[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[4],ymm10[4],ymm12[5],ymm10[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 236(%r8), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%rdx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[1],ymm4[1],ymm11[4],ymm4[4],ymm11[5],ymm4[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 236(%r8), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm5 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = mem[1,2,2,3,5,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm7[3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm4[2],ymm11[3],ymm4[3],ymm11[6],ymm4[6],ymm11[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[6],ymm2[6],ymm12[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = mem[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[2,3],ymm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm7[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[6],ymm11[6],ymm4[7],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -11116,33 +11120,33 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm6[1,1],ymm12[5,5],ymm6[5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm7[1,1],ymm1[5,5],ymm7[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -11159,9 +11163,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm15[1,1],ymm1[5,5],ymm15[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -11175,8 +11179,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -11231,9 +11235,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm4[1,1],ymm0[5,5],ymm4[5,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -11244,44 +11248,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 208(%rax), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm12[2],ymm6[3],ymm12[3],ymm6[6],ymm12[6],ymm6[7],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,3],ymm12[3,3],ymm9[7,7],ymm12[7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3],ymm14[3,3],ymm13[7,7],ymm14[7,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[4],ymm14[4],ymm10[5],ymm14[5] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm15[0],ymm4[1],ymm15[1],ymm4[4],ymm15[4],ymm4[5],ymm15[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm10[2],ymm14[3],ymm10[3],ymm14[6],ymm10[6],ymm14[7],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[6],ymm4[6],ymm15[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -11291,24 +11294,26 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm13[2],mem[2],ymm13[3],mem[3],ymm13[6],mem[6],ymm13[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -11318,14 +11323,13 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -11335,103 +11339,104 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 144(%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm1[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 144(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm5[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[4],ymm2[4],ymm15[5],ymm2[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $255, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[6],ymm15[6],ymm2[7],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2],ymm7[3,4],ymm8[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm0[1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 176(%rdx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 176(%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm10[0],ymm1[1],ymm10[1],ymm1[4],ymm10[4],ymm1[5],ymm10[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm5[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[6],ymm1[6],ymm10[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm0[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm15 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm7[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm15[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0],ymm8[1,2],ymm10[3,4],ymm8[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] -; AVX2-FAST-PERLANE-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4],ymm15[5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX2-FAST-PERLANE-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdx), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = xmm1[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = xmm10[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm10[1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm14 = xmm1[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 1440(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 1440(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1312(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 1216(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 1088(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 992(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 864(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 1216(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 1088(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 992(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 864(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 768(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 640(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, 544(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 416(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 640(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 544(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1504(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1472(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1504(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 1472(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11514,943 +11519,959 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1600(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1568(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $3000, %rsp # imm = 0xBB8 +; AVX2-FAST-PERLANE-NEXT: addq $2968, %rsp # imm = 0xB98 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: store_i32_stride7_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3016, %rsp # imm = 0xBC8 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm22 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm13 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm6, %zmm7 +; AVX512F-NEXT: subq $3080, %rsp # imm = 0xC08 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm22 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm5 +; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm4, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm9, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm9, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm11, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> -; AVX512F-NEXT: vpermt2d %zmm22, %zmm31, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm6, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm25, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm4, %zmm22, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm4, %zmm22, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm4, %zmm22, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm19, %zmm31, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm3, %zmm13, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm3, %zmm13, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm3, %zmm13, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm13, %zmm31, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm21, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm17 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm24 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm9, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm21, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm30, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm31, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm25 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm4, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm21, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm29 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm30, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm31, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm29, %zmm4, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm21 ; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm3 -; AVX512F-NEXT: vpermi2d %zmm3, %zmm23, %zmm10 -; AVX512F-NEXT: vpermi2d %zmm3, %zmm23, %zmm21 -; AVX512F-NEXT: vpermi2d %zmm23, %zmm3, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm2, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm5, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm3, %zmm7, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm3, %zmm1, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm3, %zmm21, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm3, %zmm21, %zmm30 +; AVX512F-NEXT: vpermi2d %zmm21, %zmm3, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm25, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm1, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm3, %zmm9, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm25, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm24 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm2, %zmm29, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm2, %zmm29, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm29, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm6, %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = -; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm29, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm6, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm6, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm5 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm7, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm6, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm2, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm30, %zmm13 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm8, %zmm15 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm17 ; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm16, %zmm8 -; AVX512F-NEXT: vpermi2d %zmm16, %zmm0, %zmm6 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm16, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm29, %zmm22 -; AVX512F-NEXT: vpermi2d %zmm11, %zmm12, %zmm9 -; AVX512F-NEXT: vpermi2d %zmm11, %zmm12, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm29, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm6 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm23 +; AVX512F-NEXT: vpermi2d %zmm13, %zmm12, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm13, %zmm12, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm27, %zmm12 ; AVX512F-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm16 {%k1} ; AVX512F-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 (%rax), %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqa64 (%rax), %zmm28 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512F-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm11, %zmm10, %zmm29 -; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm30, %zmm10, %zmm29 -; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm27 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512F-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm28 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512F-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm28 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm11, %zmm7, %zmm10 -; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm30, %zmm7, %zmm10 -; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512F-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm21 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm11, %zmm7, %zmm10 -; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm3 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm30, %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm31 {%k2} +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm30 {%k2} ; AVX512F-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512F-NEXT: kmovw %ecx, %k2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm17, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512F-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm11, %zmm2, %zmm7 -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm30, %zmm2, %zmm7 -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512F-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512F-NEXT: kmovw %ecx, %k3 -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm3 {%k3} -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm27 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm2 {%k3} +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm9 {%k2} ; AVX512F-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm11, %zmm2, %zmm7 -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm27 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm30, %zmm2, %zmm7 -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm18 {%k3} -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512F-NEXT: vpermi2d %zmm10, %zmm7, %zmm21 -; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm2, %zmm21 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm9 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm31 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm1 +; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 +; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm10 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512F-NEXT: movw $3612, %ax # imm = 0xE1C ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm11, %zmm2, %zmm21 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15> +; AVX512F-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512F-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm30, %zmm2, %zmm21 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm28 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512F-NEXT: vpermi2d %zmm10, %zmm7, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm21 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermi2d %zmm7, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15> -; AVX512F-NEXT: vpermi2d %zmm7, %zmm14, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> -; AVX512F-NEXT: vpermi2d %zmm7, %zmm29, %zmm14 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512F-NEXT: vpermi2d %zmm7, %zmm13, %zmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm13, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm12 {%k1} ; AVX512F-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm11, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm30, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm13, %zmm7 -; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vpermi2d %zmm10, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15> -; AVX512F-NEXT: vpermi2d %zmm10, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> -; AVX512F-NEXT: vpermi2d %zmm10, %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> -; AVX512F-NEXT: vpermi2d %zmm10, %zmm15, %zmm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] -; AVX512F-NEXT: vpermi2d %zmm17, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] -; AVX512F-NEXT: vpermi2d %zmm17, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] -; AVX512F-NEXT: vpermi2d %zmm17, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm8, %zmm15, %zmm21 +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm20 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15> +; AVX512F-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512F-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> +; AVX512F-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512F-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512F-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512F-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512F-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm7, 1472(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, 1408(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 1472(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 1408(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm9, 1344(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 1280(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm31, 1216(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 1280(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm30, 1216(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm6, 1152(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 1088(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, 960(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 896(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 1088(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, 1024(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm24, 960(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm19, 896(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512F-NEXT: vmovups (%rsp), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm19, 704(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 768(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm25, 704(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 512(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm27, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm28, 256(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm23, 128(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 64(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 1728(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 1600(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 1536(%rax) -; AVX512F-NEXT: addq $3016, %rsp # imm = 0xBC8 +; AVX512F-NEXT: vmovdqa64 %zmm21, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 512(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 320(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 256(%rax) +; AVX512F-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 64(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 1728(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, 1600(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 1536(%rax) +; AVX512F-NEXT: addq $3080, %rsp # imm = 0xC08 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i32_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3016, %rsp # imm = 0xBC8 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm22 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm13 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm6, %zmm7 +; AVX512BW-NEXT: subq $3080, %rsp # imm = 0xC08 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm22 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm9, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm9, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm31, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm22, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm22, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm22, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm31, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm13, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm13, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm24 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm21, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm31, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm25 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm21, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm29 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm21 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm3 -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm23, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm23, %zmm21 -; AVX512BW-NEXT: vpermi2d %zmm23, %zmm3, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm21, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm21, %zmm30 +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm25, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm9, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm25, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm29, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm29, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm6, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm5 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm7, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm6, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm30, %zmm13 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm8, %zmm15 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm16, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm16, %zmm0, %zmm6 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm16, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm29, %zmm22 -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm12, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm12, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm29, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm23 +; AVX512BW-NEXT: vpermi2d %zmm13, %zmm12, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm13, %zmm12, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm12 ; AVX512BW-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm16 {%k1} ; AVX512BW-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm28 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512BW-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm10, %zmm29 -; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm10, %zmm29 -; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512BW-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512BW-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm7, %zmm10 -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm10 -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512BW-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm7, %zmm10 -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm2 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm31 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm30 {%k2} ; AVX512BW-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512BW-NEXT: kmovd %ecx, %k2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-NEXT: kmovd %ecx, %k3 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm3 {%k3} -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k3} +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm9 {%k2} ; AVX512BW-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm27 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm18 {%k3} -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm7, %zmm21 -; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm2, %zmm21 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm9 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm31 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512BW-NEXT: movw $3612, %ax # imm = 0xE1C ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm2, %zmm21 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15> +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm2, %zmm21 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm28 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm7, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15> -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm29, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm13, %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm12 {%k1} ; AVX512BW-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm13, %zmm7 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15> -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm15, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm15, %zmm21 +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15> +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm7, 1472(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 1408(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1472(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 1408(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 1344(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1280(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 1280(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 1216(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 1088(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 896(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 1088(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 1024(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 896(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 704(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 704(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 256(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 128(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 64(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 1728(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 1600(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 1536(%rax) -; AVX512BW-NEXT: addq $3016, %rsp # imm = 0xBC8 +; AVX512BW-NEXT: vmovdqa64 %zmm21, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 320(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 256(%rax) +; AVX512BW-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 64(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 1728(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 1600(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 1536(%rax) +; AVX512BW-NEXT: addq $3080, %rsp # imm = 0xC08 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll index 69d8fa57cd482..f2b18eb113bbc 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll @@ -352,133 +352,134 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride8_vf8: ; SSE: # %bb.0: -; SSE-NEXT: subq $56, %rsp +; SSE-NEXT: subq $72, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movaps (%rdi), %xmm3 +; SSE-NEXT: movaps (%rdi), %xmm8 ; SSE-NEXT: movaps (%rsi), %xmm0 ; SSE-NEXT: movaps (%rdx), %xmm9 -; SSE-NEXT: movaps (%rcx), %xmm8 -; SSE-NEXT: movaps (%r8), %xmm5 -; SSE-NEXT: movaps 16(%r8), %xmm7 +; SSE-NEXT: movaps (%rcx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%r8), %xmm15 +; SSE-NEXT: movaps 16(%r8), %xmm10 ; SSE-NEXT: movaps (%r9), %xmm1 ; SSE-NEXT: movaps (%r10), %xmm14 -; SSE-NEXT: movaps 16(%r10), %xmm15 +; SSE-NEXT: movaps 16(%r10), %xmm12 ; SSE-NEXT: movaps (%rax), %xmm4 -; SSE-NEXT: movaps 16(%rax), %xmm10 +; SSE-NEXT: movaps 16(%rax), %xmm7 ; SSE-NEXT: movaps %xmm4, %xmm2 ; SSE-NEXT: movaps %xmm4, %xmm11 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0] -; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: movaps %xmm15, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0] -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm14, %xmm2 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0] -; SSE-NEXT: movaps 16(%r9), %xmm10 -; SSE-NEXT: movaps %xmm7, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] +; SSE-NEXT: movaps 16(%r9), %xmm7 +; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1] ; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdx), %xmm0 -; SSE-NEXT: movaps 16(%rcx), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps 16(%rdx), %xmm6 +; SSE-NEXT: movaps 16(%rcx), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm6[0] ; SSE-NEXT: movaps 16(%rdi), %xmm2 ; SSE-NEXT: movaps 16(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; SSE-NEXT: movaps %xmm15, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] -; SSE-NEXT: movaps %xmm7, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm10[0,2] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; SSE-NEXT: movaps %xmm12, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm10, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm7[0,2] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] +; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2] ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm14[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm1[2,0] -; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm1[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm9[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[2,0] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm4[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm14[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm5[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm9[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm14[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm3[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm9[0,2] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm15[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm12[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm1[2,0] -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm9[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm15[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm12[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,3],xmm9[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm12[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm2, 224(%rax) -; SSE-NEXT: movaps %xmm7, 240(%rax) -; SSE-NEXT: movaps %xmm3, 160(%rax) +; SSE-NEXT: movaps %xmm10, 240(%rax) +; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps %xmm13, 176(%rax) -; SSE-NEXT: movaps %xmm4, 96(%rax) -; SSE-NEXT: movaps %xmm6, 112(%rax) +; SSE-NEXT: movaps %xmm14, 96(%rax) +; SSE-NEXT: movaps %xmm15, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps %xmm8, 48(%rax) -; SSE-NEXT: movaps %xmm10, 192(%rax) -; SSE-NEXT: movaps %xmm11, 208(%rax) +; SSE-NEXT: movaps %xmm5, 48(%rax) +; SSE-NEXT: movaps %xmm7, 192(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: addq $56, %rsp +; SSE-NEXT: addq $72, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride8_vf8: @@ -827,9 +828,9 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps 16(%rdx), %xmm0 ; SSE-NEXT: movaps (%rcx), %xmm3 ; SSE-NEXT: movaps (%r8), %xmm11 -; SSE-NEXT: movaps (%r9), %xmm6 +; SSE-NEXT: movaps (%r9), %xmm7 ; SSE-NEXT: movaps (%r10), %xmm5 -; SSE-NEXT: movaps (%rax), %xmm7 +; SSE-NEXT: movaps (%rax), %xmm6 ; SSE-NEXT: movaps %xmm3, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] ; SSE-NEXT: movaps %xmm9, %xmm13 @@ -837,10 +838,10 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm13, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movaps %xmm6, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm5[0] ; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] ; SSE-NEXT: movaps %xmm14, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -848,7 +849,7 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movaps %xmm6, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -859,17 +860,17 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[0,2] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm11, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; SSE-NEXT: movaps %xmm11, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 16(%rcx), %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] ; SSE-NEXT: movaps 16(%r10), %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm6[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm2 @@ -916,109 +917,110 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm3[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdx), %xmm1 -; SSE-NEXT: movaps 32(%rcx), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps 32(%rdx), %xmm3 +; SSE-NEXT: movaps 32(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: movaps 32(%rdi), %xmm14 -; SSE-NEXT: movaps 32(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm14, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: movaps %xmm15, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%r10), %xmm0 -; SSE-NEXT: movaps 32(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps 32(%r8), %xmm9 +; SSE-NEXT: movaps 32(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm14, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%r10), %xmm1 +; SSE-NEXT: movaps 32(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 32(%r8), %xmm11 ; SSE-NEXT: movaps 32(%r9), %xmm6 -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] -; SSE-NEXT: movaps %xmm11, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm11, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] +; SSE-NEXT: movaps %xmm13, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm5[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps %xmm14, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; SSE-NEXT: movaps %xmm9, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: movaps %xmm11, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] ; SSE-NEXT: movaps 48(%rdx), %xmm1 -; SSE-NEXT: movaps 48(%rcx), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps 48(%rcx), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps 48(%rdi), %xmm4 +; SSE-NEXT: movaps 48(%rdi), %xmm3 ; SSE-NEXT: movaps 48(%rsi), %xmm12 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; SSE-NEXT: movaps %xmm4, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] ; SSE-NEXT: movaps 48(%r10), %xmm0 -; SSE-NEXT: movaps 48(%rax), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps 48(%rax), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps 48(%r8), %xmm3 -; SSE-NEXT: movaps 48(%r9), %xmm13 -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] -; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movaps 48(%r8), %xmm5 +; SSE-NEXT: movaps 48(%r9), %xmm9 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; SSE-NEXT: movaps %xmm6, %xmm10 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm2[2,0] -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm2[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm12[2],xmm4[3],xmm12[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] -; SSE-NEXT: movaps %xmm4, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] +; SSE-NEXT: movaps %xmm3, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm2[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] ; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] -; SSE-NEXT: movaps %xmm3, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm0[0,2] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 496(%rax) -; SSE-NEXT: movaps %xmm4, 480(%rax) -; SSE-NEXT: movaps %xmm13, 464(%rax) +; SSE-NEXT: movaps %xmm5, 496(%rax) +; SSE-NEXT: movaps %xmm3, 480(%rax) +; SSE-NEXT: movaps %xmm9, 464(%rax) ; SSE-NEXT: movaps %xmm12, 448(%rax) -; SSE-NEXT: movaps %xmm8, 432(%rax) -; SSE-NEXT: movaps %xmm5, 416(%rax) +; SSE-NEXT: movaps %xmm6, 432(%rax) +; SSE-NEXT: movaps %xmm4, 416(%rax) ; SSE-NEXT: movaps %xmm10, 400(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 384(%rax) -; SSE-NEXT: movaps %xmm9, 368(%rax) +; SSE-NEXT: movaps %xmm15, 384(%rax) +; SSE-NEXT: movaps %xmm11, 368(%rax) ; SSE-NEXT: movaps %xmm14, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 336(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%rax) -; SSE-NEXT: movaps %xmm11, 304(%rax) -; SSE-NEXT: movaps %xmm15, 288(%rax) +; SSE-NEXT: movaps %xmm13, 304(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1113,17 +1115,17 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm11 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm5[1,0],ymm6[5,4],ymm5[5,4] ; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm13[2,3],ymm5[6,4],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[2,0],ymm13[2,3],ymm5[6,4],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm5 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] @@ -1134,7 +1136,7 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm12[2],ymm5[2],ymm12[3],ymm5[3],ymm12[6],ymm5[6],ymm12[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,0],ymm10[3,0],ymm11[7,4],ymm10[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7] @@ -1145,7 +1147,7 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm12[0],ymm5[2],ymm12[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] @@ -1156,7 +1158,7 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm5[0],ymm12[1],ymm5[1],ymm12[4],ymm5[4],ymm12[5],ymm5[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,0],ymm10[1,0],ymm11[5,4],ymm10[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[4],ymm2[4],ymm9[5],ymm2[5] @@ -1167,115 +1169,115 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm15 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm15[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm15[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm14 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm8 ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm5 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1,2],xmm12[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm12[3] ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm14[2],xmm8[2],xmm14[3],xmm8[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 320(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1302,108 +1304,108 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-ONLY-LABEL: store_i32_stride8_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-ONLY-NEXT: subq $328, %rsp # imm = 0x148 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm12 ; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm13 -; AVX2-ONLY-NEXT: vbroadcastss 56(%rax), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm1 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm14 = ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7] +; AVX2-ONLY-NEXT: vbroadcastss 56(%rax), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm14 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm10[2],ymm4[2],ymm10[3],ymm4[3],ymm10[6],ymm4[6],ymm10[7],ymm4[7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 60(%r10), %ymm0 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 60(%r10), %ymm1 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[6],ymm13[6],ymm1[7],ymm13[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm9[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm5[0],ymm11[0],ymm5[1],ymm11[1],ymm5[4],ymm11[4],ymm5[5],ymm11[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm11[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm10[0],ymm4[0],ymm10[1],ymm4[1],ymm10[4],ymm4[4],ymm10[5],ymm4[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3,4],ymm4[5],ymm10[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm10 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 24(%rax), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[6],ymm1[6],ymm6[7],ymm1[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm9[2],ymm0[2],ymm9[3],ymm0[3],ymm9[6],ymm0[6],ymm9[7],ymm0[7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm2 -; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm4 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm11[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm10 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[4],ymm6[4],ymm9[5],ymm6[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm9 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[4],ymm1[4],ymm6[5],ymm1[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 24(%rax), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm6 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm3 +; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm6 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm11[1],ymm6[1],ymm11[3],ymm6[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm10[0],ymm3[0],ymm10[1],ymm3[1],ymm10[4],ymm3[4],ymm10[5],ymm3[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2,3,4],ymm8[5],ymm2[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm15 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm14 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; AVX2-ONLY-NEXT: vmovaps %xmm1, %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm2 @@ -1419,74 +1421,75 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm6 -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm14 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm13 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm0 -; AVX2-ONLY-NEXT: vbroadcastss %xmm15, %xmm1 +; AVX2-ONLY-NEXT: vbroadcastss %xmm14, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm1 ; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 32(%r10), %xmm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 32(%r10), %xmm6 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm10 ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm9 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm1[0,1,2],xmm13[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm2[3] ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm0 -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm6 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastss %xmm8, %xmm6 -; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm15[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm0 +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm5 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm5 +; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm8 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -1496,8 +1499,8 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm13[2],mem[2],xmm13[3],mem[3] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 @@ -1505,10 +1508,10 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm15[0,1,2],xmm4[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm15[2],xmm5[2],xmm15[3],xmm5[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 @@ -1520,13 +1523,14 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 256(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 256(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) @@ -1544,7 +1548,7 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rax) -; AVX2-ONLY-NEXT: addq $296, %rsp # imm = 0x128 +; AVX2-ONLY-NEXT: addq $328, %rsp # imm = 0x148 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1907,21 +1911,21 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm3[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdx), %xmm0 -; SSE-NEXT: movaps 32(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 32(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%r10), %xmm1 -; SSE-NEXT: movaps 32(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 32(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 32(%rdi), %xmm7 +; SSE-NEXT: movaps 32(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%r10), %xmm2 +; SSE-NEXT: movaps 32(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 32(%r8), %xmm11 ; SSE-NEXT: movaps 32(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -1929,48 +1933,48 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdx), %xmm0 -; SSE-NEXT: movaps 48(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 48(%rdi), %xmm8 -; SSE-NEXT: movaps 48(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%r10), %xmm1 -; SSE-NEXT: movaps 48(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 48(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 48(%rdi), %xmm7 +; SSE-NEXT: movaps 48(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%r10), %xmm2 +; SSE-NEXT: movaps 48(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 48(%r8), %xmm11 ; SSE-NEXT: movaps 48(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -1978,48 +1982,48 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdx), %xmm0 -; SSE-NEXT: movaps 64(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 64(%rdi), %xmm8 -; SSE-NEXT: movaps 64(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%r10), %xmm1 -; SSE-NEXT: movaps 64(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 64(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 64(%rdi), %xmm7 +; SSE-NEXT: movaps 64(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%r10), %xmm2 +; SSE-NEXT: movaps 64(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 64(%r8), %xmm11 ; SSE-NEXT: movaps 64(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -2027,48 +2031,48 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdx), %xmm0 -; SSE-NEXT: movaps 80(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 80(%rdi), %xmm8 -; SSE-NEXT: movaps 80(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%r10), %xmm1 -; SSE-NEXT: movaps 80(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 80(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 80(%rdi), %xmm7 +; SSE-NEXT: movaps 80(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%r10), %xmm2 +; SSE-NEXT: movaps 80(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 80(%r8), %xmm11 ; SSE-NEXT: movaps 80(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -2076,135 +2080,136 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdx), %xmm0 -; SSE-NEXT: movaps 96(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 96(%rdi), %xmm12 -; SSE-NEXT: movaps 96(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm12, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] -; SSE-NEXT: movaps %xmm15, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%r10), %xmm1 -; SSE-NEXT: movaps 96(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: movaps 96(%r8), %xmm7 -; SSE-NEXT: movaps 96(%r9), %xmm9 -; SSE-NEXT: movaps %xmm7, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; SSE-NEXT: movaps %xmm11, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm12, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm0[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm1[0,2] -; SSE-NEXT: movaps 112(%rdx), %xmm2 -; SSE-NEXT: movaps 112(%rcx), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: movaps 96(%rdx), %xmm2 +; SSE-NEXT: movaps 96(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps 112(%rdi), %xmm0 -; SSE-NEXT: movaps 112(%rsi), %xmm13 -; SSE-NEXT: movaps %xmm0, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1] -; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: movaps 96(%rdi), %xmm15 +; SSE-NEXT: movaps 96(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm15, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%r10), %xmm1 -; SSE-NEXT: movaps 112(%rax), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSE-NEXT: movaps 112(%r8), %xmm3 -; SSE-NEXT: movaps 112(%r9), %xmm14 -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1] -; SSE-NEXT: movaps %xmm8, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm4[2,0] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: movaps 96(%r10), %xmm1 +; SSE-NEXT: movaps 96(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 96(%r8), %xmm13 +; SSE-NEXT: movaps 96(%r9), %xmm6 +; SSE-NEXT: movaps %xmm13, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] +; SSE-NEXT: movaps %xmm12, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm5[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm4[2],xmm15[3],xmm4[3] ; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; SSE-NEXT: movaps %xmm0, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm4[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps %xmm15, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm6[2],xmm13[3],xmm6[3] ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm1[0,2] +; SSE-NEXT: movaps 112(%rdx), %xmm1 +; SSE-NEXT: movaps 112(%rcx), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps 112(%rdi), %xmm2 +; SSE-NEXT: movaps 112(%rsi), %xmm11 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] ; SSE-NEXT: movaps %xmm3, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm4[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] +; SSE-NEXT: movaps 112(%r10), %xmm0 +; SSE-NEXT: movaps 112(%rax), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps 112(%r8), %xmm4 +; SSE-NEXT: movaps 112(%r9), %xmm9 +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm5[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm5[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm0[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 1008(%rax) -; SSE-NEXT: movaps %xmm0, 992(%rax) -; SSE-NEXT: movaps %xmm14, 976(%rax) -; SSE-NEXT: movaps %xmm13, 960(%rax) -; SSE-NEXT: movaps %xmm8, 944(%rax) -; SSE-NEXT: movaps %xmm9, 928(%rax) +; SSE-NEXT: movaps %xmm4, 1008(%rax) +; SSE-NEXT: movaps %xmm2, 992(%rax) +; SSE-NEXT: movaps %xmm9, 976(%rax) +; SSE-NEXT: movaps %xmm11, 960(%rax) +; SSE-NEXT: movaps %xmm6, 944(%rax) +; SSE-NEXT: movaps %xmm3, 928(%rax) ; SSE-NEXT: movaps %xmm10, 912(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 896(%rax) -; SSE-NEXT: movaps %xmm7, 880(%rax) -; SSE-NEXT: movaps %xmm12, 864(%rax) +; SSE-NEXT: movaps %xmm14, 896(%rax) +; SSE-NEXT: movaps %xmm13, 880(%rax) +; SSE-NEXT: movaps %xmm15, 864(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 848(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 832(%rax) -; SSE-NEXT: movaps %xmm11, 816(%rax) -; SSE-NEXT: movaps %xmm15, 800(%rax) +; SSE-NEXT: movaps %xmm12, 816(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 800(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 784(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2319,53 +2324,53 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm7 ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm8 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm10 -; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm11 +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm11 +; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm10 ; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[4],ymm10[4],ymm8[5],ymm10[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,0],ymm9[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,0],ymm9[4,5],ymm6[6,4] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[4],ymm12[4],ymm11[5],ymm12[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[1,0],ymm8[1,0],ymm10[5,4],ymm8[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[2,0],ymm7[2,3],ymm9[6,4],ymm7[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[1,0],ymm8[1,0],ymm11[5,4],ymm8[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[2,0],ymm6[2,3],ymm9[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[6],ymm10[6],ymm8[7],ymm10[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1],ymm7[2,0],ymm9[4,5],ymm7[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[6],ymm11[6],ymm8[7],ymm11[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1],ymm6[2,0],ymm9[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm7[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm6[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm9 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[3,0],ymm8[3,0],ymm10[7,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm8[3,0],ymm11[7,4],ymm8[7,4] ; AVX1-ONLY-NEXT: vmovaps 32(%r10), %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm10[2,0],ymm12[2,3],ymm10[6,4],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm10 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm11[2,0],ymm12[2,3],ymm11[6,4],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm11 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[6],ymm7[6],ymm5[7],ymm7[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] @@ -2373,21 +2378,21 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[4],ymm11[4],ymm9[5],ymm11[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm8[0],ymm11[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[4],ymm10[4],ymm8[5],ymm10[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,0],ymm9[1,0],ymm11[5,4],ymm9[5,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[1,0],ymm9[1,0],ymm10[5,4],ymm9[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[4],ymm7[4],ymm2[5],ymm7[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] @@ -2395,126 +2400,126 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[6],ymm11[6],ymm9[7],ymm11[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm8[1],ymm11[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm2[1],ymm6[3],ymm2[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[6],ymm10[6],ymm8[7],ymm10[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[6],ymm11[6],ymm8[7],ymm11[7] ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[3,0],ymm9[3,0],ymm11[7,4],ymm9[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[3,0],ymm9[3,0],ymm10[7,4],ymm9[7,4] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm6[2,3],ymm8[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm6 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[6],ymm7[6],ymm2[7],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm7 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[6],ymm6[6],ymm2[7],ymm6[7] ; AVX1-ONLY-NEXT: vmovaps 64(%r10), %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1],ymm1[2,0],ymm6[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm8 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm6[1,0],ymm5[1,0],ymm6[5,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,0],ymm1[2,3],ymm7[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[1,0],ymm5[1,0],ymm7[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,0],ymm6[2,3],ymm8[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[4],ymm9[4],ymm1[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[6],ymm7[6],ymm5[7],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,0],ymm8[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,0],ymm5[3,0],ymm6[7,4],ymm5[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,0],ymm5[3,0],ymm7[7,4],ymm5[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%r10), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%r10), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,0],ymm2[1,0],ymm3[5,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[6],ymm5[6],ymm2[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,3],ymm0[6,4],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,3],ymm0[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -2525,189 +2530,189 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm3 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm10 +; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm2[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm4[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 64(%r10), %xmm8 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 64(%r10), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm4[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm9 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 96(%r10), %xmm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm5[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 96(%r10), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm1[1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm5[1],xmm13[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm3[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm4[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm0[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -2717,11 +2722,11 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 832(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 800(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 768(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 608(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 576(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 512(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 608(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 576(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 512(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2784,154 +2789,154 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm8 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm9 ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm10 ; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm12 -; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm8 ; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm11 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[4],ymm11[4],ymm9[5],ymm11[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm7[0],ymm15[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm6 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm5[1],ymm14[2,3,4],ymm5[5],ymm14[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 24(%r10), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3,4,5,6],ymm13[7] -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm8[0,1,2,3,4,5,6],ymm13[7] +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm8 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] ; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm10 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm5 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm5 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[6],ymm9[6],ymm7[7],ymm9[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 28(%rax), %ymm8 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[6],ymm11[6],ymm8[7],ymm11[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm12[1],ymm8[1],ymm12[3],ymm8[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 28(%rax), %ymm5 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm12[1],ymm5[1],ymm12[3],ymm5[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm10[0],ymm5[0],ymm10[1],ymm5[1],ymm10[4],ymm5[4],ymm10[5],ymm5[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm8[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[4],ymm7[4],ymm10[5],ymm7[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm4 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1],ymm8[2,3,4],ymm1[5],ymm8[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm4 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1],ymm9[2,3,4],ymm1[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 56(%r10), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm6[7] -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm8 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[6],ymm9[6],ymm7[7],ymm9[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 56(%r10), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm5[7] +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm6 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm0 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 60(%rax), %ymm3 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm5 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[4],ymm8[4],ymm4[5],ymm8[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 64(%rax), %ymm7 +; AVX2-ONLY-NEXT: vbroadcastss 60(%rax), %ymm1 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm3[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 64(%rax), %ymm8 ; AVX2-ONLY-NEXT: vmovaps 64(%r10), %ymm9 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 84(%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 84(%r8), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm5[1],ymm10[2,3,4],ymm5[5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 88(%r10), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[6],ymm0[6],ymm6[7],ymm0[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 92(%rax), %ymm1 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 88(%r10), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[6],ymm0[6],ymm6[7],ymm0[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 92(%rax), %ymm0 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm2 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 96(%rax), %ymm8 @@ -2939,37 +2944,37 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 116(%r8), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 116(%r8), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 120(%r10), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 120(%r10), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 124(%rax), %ymm0 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 124(%rax), %ymm2 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm0 ; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm2 @@ -3081,117 +3086,117 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm2 -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm3 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm0 +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm2[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%r10), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm0 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm7 -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm8 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm7 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm8 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm6 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm6[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm8[1],xmm15[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm3[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm4 -; AVX2-ONLY-NEXT: vbroadcastss %xmm4, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 +; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm5 +; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%r10), %xmm2 -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm15 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%r10), %xmm3 +; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm15 ; AVX2-ONLY-NEXT: vmovaps 96(%rax), %xmm1 -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm14 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm15 ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %xmm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm0[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm1[0,1,2],xmm11[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm4[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm3[0,1,2],xmm12[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm5[0,1,2],xmm12[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm2[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, 864(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 832(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 800(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 768(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 608(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 576(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 544(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 512(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 832(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 800(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 768(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 608(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 576(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 544(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 512(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3250,285 +3255,290 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm13 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm27 ; AVX512F-NEXT: vmovdqa64 (%r10), %zmm2 ; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512F-NEXT: vmovdqa64 (%rax), %zmm30 -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm17 +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm28 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm4, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm6, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm8, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm10, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm12, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm16, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = -; AVX512F-NEXT: vpermt2d %zmm30, %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm14, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm5 +; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm17, %zmm31, %zmm0 +; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm21, %zmm31 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vpermt2d %zmm13, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm13, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm13, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm27 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm1 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm13, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm13, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm13, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm13, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm13, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm13 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm27, %zmm29 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm22, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm23, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm24, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm25, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm26, %zmm30 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512F-NEXT: vpermt2d %zmm1, %zmm19, %zmm9 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm16 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm27 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm28 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm22 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm23 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm24 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm25 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm11, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm8, %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm7, %zmm16 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm6, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm5, %zmm14 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm29 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm1, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> -; AVX512F-NEXT: vpermt2d %zmm19, %zmm3, %zmm17 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> +; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm19, %zmm11 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm19, %zmm8 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm19, %zmm7 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm19, %zmm6 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm19, %zmm5 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm19, %zmm4 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm19 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512F-NEXT: movb $-120, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 {%k1} ; AVX512F-NEXT: movb $34, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} ; AVX512F-NEXT: movb $-52, %al ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm20 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm18 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm16 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm15 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm17 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm15 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm14 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm13 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm11 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm10 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm10 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm9 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm6 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm4 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm1 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm19 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 {%k3} ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm19, 896(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 960(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 832(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512F-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -3539,285 +3549,290 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm13 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm27 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm30 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm28 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm6, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm12, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm16, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm14, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm5 +; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm31, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm13, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm27 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm13, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm13, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm13, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm13, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm13, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm27, %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm24, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm26, %zmm30 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm19, %zmm9 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm27 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm28 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm22 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm23 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm24 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm25 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm11, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm8, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm7, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm6, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm5, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm29 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm3, %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm19, %zmm11 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm19, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm19, %zmm7 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm19, %zmm6 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm19, %zmm5 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm19, %zmm4 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm19 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512BW-NEXT: movb $-120, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 {%k1} ; AVX512BW-NEXT: movb $34, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm20 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm15 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm9 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k3} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm19, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 960(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512BW-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3945,21 +3960,21 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdx), %xmm0 -; SSE-NEXT: movaps 32(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 32(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%r10), %xmm1 -; SSE-NEXT: movaps 32(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 32(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 32(%rdi), %xmm7 +; SSE-NEXT: movaps 32(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%r10), %xmm2 +; SSE-NEXT: movaps 32(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 32(%r8), %xmm11 ; SSE-NEXT: movaps 32(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -3967,48 +3982,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdx), %xmm0 -; SSE-NEXT: movaps 48(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 48(%rdi), %xmm8 -; SSE-NEXT: movaps 48(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%r10), %xmm1 -; SSE-NEXT: movaps 48(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 48(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 48(%rdi), %xmm7 +; SSE-NEXT: movaps 48(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%r10), %xmm2 +; SSE-NEXT: movaps 48(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 48(%r8), %xmm11 ; SSE-NEXT: movaps 48(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4016,48 +4031,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdx), %xmm0 -; SSE-NEXT: movaps 64(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 64(%rdi), %xmm8 -; SSE-NEXT: movaps 64(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%r10), %xmm1 -; SSE-NEXT: movaps 64(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 64(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 64(%rdi), %xmm7 +; SSE-NEXT: movaps 64(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%r10), %xmm2 +; SSE-NEXT: movaps 64(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 64(%r8), %xmm11 ; SSE-NEXT: movaps 64(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4065,48 +4080,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdx), %xmm0 -; SSE-NEXT: movaps 80(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 80(%rdi), %xmm8 -; SSE-NEXT: movaps 80(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%r10), %xmm1 -; SSE-NEXT: movaps 80(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 80(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 80(%rdi), %xmm7 +; SSE-NEXT: movaps 80(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%r10), %xmm2 +; SSE-NEXT: movaps 80(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 80(%r8), %xmm11 ; SSE-NEXT: movaps 80(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4114,48 +4129,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rdx), %xmm0 -; SSE-NEXT: movaps 96(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 96(%rdi), %xmm8 -; SSE-NEXT: movaps 96(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%r10), %xmm1 -; SSE-NEXT: movaps 96(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 96(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps 96(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%r10), %xmm2 +; SSE-NEXT: movaps 96(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 96(%r8), %xmm11 ; SSE-NEXT: movaps 96(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4163,48 +4178,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdx), %xmm0 -; SSE-NEXT: movaps 112(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 112(%rdi), %xmm8 -; SSE-NEXT: movaps 112(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%r10), %xmm1 -; SSE-NEXT: movaps 112(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 112(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 112(%rdi), %xmm7 +; SSE-NEXT: movaps 112(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%r10), %xmm2 +; SSE-NEXT: movaps 112(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 112(%r8), %xmm11 ; SSE-NEXT: movaps 112(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4212,48 +4227,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdx), %xmm0 -; SSE-NEXT: movaps 128(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 128(%rdi), %xmm8 -; SSE-NEXT: movaps 128(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%r10), %xmm1 -; SSE-NEXT: movaps 128(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 128(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 128(%rdi), %xmm7 +; SSE-NEXT: movaps 128(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%r10), %xmm2 +; SSE-NEXT: movaps 128(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 128(%r8), %xmm11 ; SSE-NEXT: movaps 128(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4261,48 +4276,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdx), %xmm0 -; SSE-NEXT: movaps 144(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 144(%rdi), %xmm8 -; SSE-NEXT: movaps 144(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%r10), %xmm1 -; SSE-NEXT: movaps 144(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 144(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 144(%rdi), %xmm7 +; SSE-NEXT: movaps 144(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%r10), %xmm2 +; SSE-NEXT: movaps 144(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 144(%r8), %xmm11 ; SSE-NEXT: movaps 144(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4310,48 +4325,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%rdx), %xmm0 -; SSE-NEXT: movaps 160(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 160(%rdi), %xmm8 -; SSE-NEXT: movaps 160(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%r10), %xmm1 -; SSE-NEXT: movaps 160(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 160(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 160(%rdi), %xmm7 +; SSE-NEXT: movaps 160(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%r10), %xmm2 +; SSE-NEXT: movaps 160(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 160(%r8), %xmm11 ; SSE-NEXT: movaps 160(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4359,48 +4374,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%rdx), %xmm0 -; SSE-NEXT: movaps 176(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 176(%rdi), %xmm8 -; SSE-NEXT: movaps 176(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%r10), %xmm1 -; SSE-NEXT: movaps 176(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 176(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 176(%rdi), %xmm7 +; SSE-NEXT: movaps 176(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%r10), %xmm2 +; SSE-NEXT: movaps 176(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 176(%r8), %xmm11 ; SSE-NEXT: movaps 176(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4408,48 +4423,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rdx), %xmm0 -; SSE-NEXT: movaps 192(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 192(%rdi), %xmm8 -; SSE-NEXT: movaps 192(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%r10), %xmm1 -; SSE-NEXT: movaps 192(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 192(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 192(%rdi), %xmm7 +; SSE-NEXT: movaps 192(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%r10), %xmm2 +; SSE-NEXT: movaps 192(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 192(%r8), %xmm11 ; SSE-NEXT: movaps 192(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4457,48 +4472,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 208(%rdx), %xmm0 -; SSE-NEXT: movaps 208(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 208(%rdi), %xmm8 -; SSE-NEXT: movaps 208(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%r10), %xmm1 -; SSE-NEXT: movaps 208(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 208(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 208(%rdi), %xmm7 +; SSE-NEXT: movaps 208(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%r10), %xmm2 +; SSE-NEXT: movaps 208(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 208(%r8), %xmm11 ; SSE-NEXT: movaps 208(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4506,135 +4521,136 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdx), %xmm0 -; SSE-NEXT: movaps 224(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 224(%rdx), %xmm1 +; SSE-NEXT: movaps 224(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 224(%rdi), %xmm12 -; SSE-NEXT: movaps 224(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm12, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] -; SSE-NEXT: movaps %xmm15, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%r10), %xmm1 -; SSE-NEXT: movaps 224(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: movaps 224(%r8), %xmm8 +; SSE-NEXT: movaps 224(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm12, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%r10), %xmm2 +; SSE-NEXT: movaps 224(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 224(%r8), %xmm15 ; SSE-NEXT: movaps 224(%r9), %xmm6 -; SSE-NEXT: movaps %xmm8, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] -; SSE-NEXT: movaps %xmm11, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps %xmm15, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] +; SSE-NEXT: movaps %xmm13, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm5[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps %xmm12, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm15, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm0[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm2[0,2] ; SSE-NEXT: movaps 240(%rdx), %xmm2 -; SSE-NEXT: movaps 240(%rcx), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 240(%rsi), %xmm13 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%r10), %xmm1 -; SSE-NEXT: movaps 240(%rax), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movaps 240(%r8), %xmm4 -; SSE-NEXT: movaps 240(%r9), %xmm14 -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] -; SSE-NEXT: movaps %xmm9, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm3[2,0] +; SSE-NEXT: movaps 240(%rcx), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movaps 240(%rdi), %xmm1 +; SSE-NEXT: movaps 240(%rsi), %xmm11 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] +; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] +; SSE-NEXT: movaps 240(%r10), %xmm0 +; SSE-NEXT: movaps 240(%rax), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps 240(%r8), %xmm5 +; SSE-NEXT: movaps 240(%r9), %xmm9 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm8, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm3[2,0] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm3[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm3[2,0] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] ; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm8[1] +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE-NEXT: movaps %xmm0, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm3[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm14[2],xmm4[3],xmm14[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; SSE-NEXT: movaps %xmm4, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm4, 2032(%rax) -; SSE-NEXT: movaps %xmm0, 2016(%rax) -; SSE-NEXT: movaps %xmm14, 2000(%rax) -; SSE-NEXT: movaps %xmm13, 1984(%rax) -; SSE-NEXT: movaps %xmm9, 1968(%rax) -; SSE-NEXT: movaps %xmm5, 1952(%rax) +; SSE-NEXT: movaps %xmm5, 2032(%rax) +; SSE-NEXT: movaps %xmm1, 2016(%rax) +; SSE-NEXT: movaps %xmm9, 2000(%rax) +; SSE-NEXT: movaps %xmm11, 1984(%rax) +; SSE-NEXT: movaps %xmm6, 1968(%rax) +; SSE-NEXT: movaps %xmm4, 1952(%rax) ; SSE-NEXT: movaps %xmm10, 1936(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1920(%rax) -; SSE-NEXT: movaps %xmm8, 1904(%rax) +; SSE-NEXT: movaps %xmm14, 1920(%rax) +; SSE-NEXT: movaps %xmm15, 1904(%rax) ; SSE-NEXT: movaps %xmm12, 1888(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1872(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1856(%rax) -; SSE-NEXT: movaps %xmm11, 1840(%rax) -; SSE-NEXT: movaps %xmm15, 1824(%rax) +; SSE-NEXT: movaps %xmm13, 1840(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1824(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1808(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4879,50 +4895,50 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm9 ; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm11 ; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,0],ymm10[4,5],ymm8[6,4] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[4],ymm12[4],ymm11[5],ymm12[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,0],ymm9[2,3],ymm10[6,4],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[4],ymm12[4],ymm11[5],ymm12[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm9[1,0],ymm7[1,0],ymm9[5,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm8[2,3],ymm10[6,4],ymm8[6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[6],ymm9[6],ymm7[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1],ymm8[2,0],ymm10[4,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm9[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm8[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm10 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm7[3,0],ymm8[7,4],ymm7[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0],ymm7[3,0],ymm9[7,4],ymm7[7,4] ; AVX1-ONLY-NEXT: vmovaps 32(%r10), %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm8[2,0],ymm12[2,3],ymm8[6,4],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm9[2,0],ymm12[2,3],ymm9[6,4],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm9 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] @@ -4931,21 +4947,21 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm9[0],ymm2[0],ymm9[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,0],ymm10[1,0],ymm11[5,4],ymm10[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm9[0],ymm2[1],ymm9[1],ymm2[4],ymm9[4],ymm2[5],ymm9[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm8[0],ymm2[1],ymm8[1],ymm2[4],ymm8[4],ymm2[5],ymm8[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] @@ -4953,25 +4969,25 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm9[1],ymm2[1],ymm9[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[6],ymm9[6],ymm7[7],ymm9[7] ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm11[3,0],ymm10[3,0],ymm11[7,4],ymm10[7,4] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm6 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm8[2],ymm2[3],ymm8[3],ymm2[6],ymm8[6],ymm2[7],ymm8[7] ; AVX1-ONLY-NEXT: vmovaps 64(%r10), %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm0 @@ -4983,309 +4999,309 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm8 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm6[1,0],ymm5[1,0],ymm6[5,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,0],ymm1[2,3],ymm7[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm6[1,0],ymm5[1,0],ymm6[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[4],ymm9[4],ymm1[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,0],ymm5[3,0],ymm6[7,4],ymm5[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[3,0],ymm5[3,0],ymm6[7,4],ymm5[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%r10), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%r10), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,0],ymm2[1,0],ymm3[5,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[6],ymm5[6],ymm2[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,3],ymm0[6,4],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,3],ymm0[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 128(%r9), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 128(%r10), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 128(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%r10), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 128(%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[1,0],ymm2[1,0],ymm3[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm2[1,0],ymm4[5,4],ymm2[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm4[2,3],ymm2[6,4],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,3],ymm0[6,4],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[3,0],ymm4[7,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 160(%r10), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 160(%r10), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 160(%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[1,0],ymm2[1,0],ymm3[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm2[1,0],ymm4[5,4],ymm2[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm4[2,3],ymm2[6,4],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,3],ymm0[6,4],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[3,0],ymm4[7,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 192(%r10), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 192(%r10), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 192(%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[1,0],ymm2[1,0],ymm3[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm2[1,0],ymm4[5,4],ymm2[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm4[2,3],ymm2[6,4],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,3],ymm0[6,4],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[3,0],ymm4[7,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 224(%r9), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 224(%r10), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 224(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 224(%r10), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 224(%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[1,0],ymm2[1,0],ymm3[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm2[1,0],ymm4[5,4],ymm2[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm4[2,3],ymm2[6,4],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,3],ymm0[6,4],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[3,0],ymm4[7,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm3 @@ -5293,43 +5309,43 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm2[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] @@ -5337,343 +5353,343 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm0[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 64(%r10), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%r10), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm0[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 96(%r10), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 96(%r10), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm0[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 128(%r10), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%r10), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm0[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 160(%r10), %xmm8 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%r10), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 192(%r10), %xmm8 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 192(%r10), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm4[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %xmm9 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 224(%rax), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 224(%r10), %xmm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm5[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rax), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 224(%r10), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm1[1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm5[1],xmm13[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm3[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm4[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm0[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -5683,11 +5699,11 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1888(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 1856(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 1824(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 1792(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 1632(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 1600(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 1568(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 1536(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 1792(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 1632(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 1600(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 1568(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 1536(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1376(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5817,151 +5833,151 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm7 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm8 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm9 ; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm11 ; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm10 ; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm12 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm9[0],ymm15[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[4],ymm11[4],ymm9[5],ymm11[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm8[0],ymm15[2],ymm8[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm8 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm5[1],ymm14[2,3,4],ymm5[5],ymm14[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 24(%rax), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm10[0,1,2,3,4,5,6],ymm13[7] ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm10 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[6],ymm11[6],ymm8[7],ymm11[7] -; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm8 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[6],ymm11[6],ymm9[7],ymm11[7] +; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm9 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm5 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm7 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm7[2],ymm12[2],ymm7[3],ymm12[3],ymm7[6],ymm12[6],ymm7[7],ymm12[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm5 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm12[2],ymm5[3],ymm12[3],ymm5[6],ymm12[6],ymm5[7],ymm12[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[4],ymm5[4],ymm8[5],ymm5[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm7[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm6[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[4],ymm10[4],ymm8[5],ymm10[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm10[5],ymm6[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm4 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm4 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 56(%rax), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm6[7] -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm7 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 56(%rax), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm5[7] +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[6],ymm10[6],ymm8[7],ymm10[7] ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm6 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm0 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 60(%r10), %ymm3 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm5 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm3[1],ymm9[3],ymm3[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[4],ymm7[4],ymm4[5],ymm7[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm3 +; AVX2-ONLY-NEXT: vbroadcastss 60(%r10), %ymm1 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm3[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 64(%r10), %ymm8 ; AVX2-ONLY-NEXT: vmovaps 64(%rax), %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 84(%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 84(%r8), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm5[1],ymm10[2,3,4],ymm5[5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 88(%rax), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[6],ymm7[6],ymm4[7],ymm7[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[6],ymm0[6],ymm6[7],ymm0[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 92(%r10), %ymm1 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 88(%rax), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[6],ymm0[6],ymm6[7],ymm0[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 92(%r10), %ymm0 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm2 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 96(%r10), %ymm8 @@ -5969,37 +5985,37 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 116(%r8), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 116(%r8), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 120(%rax), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 120(%rax), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 124(%r10), %ymm0 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 124(%r10), %ymm2 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm1 @@ -6523,8 +6539,8 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %xmm6 -; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm2 +; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm2 ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm3 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -6536,104 +6552,104 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm7 ; AVX2-ONLY-NEXT: vmovaps 192(%r10), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm8 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovaps 192(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm7 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-ONLY-NEXT: vmovaps 192(%r9), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm6 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm6[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm8[1],xmm15[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%rcx), %xmm4 -; AVX2-ONLY-NEXT: vbroadcastss %xmm4, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm1 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 224(%rax), %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rcx), %xmm6 +; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm0 +; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm5 +; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm1 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rax), %xmm3 ; AVX2-ONLY-NEXT: vmovaps 224(%r10), %xmm1 -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm15 -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm15 +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm14 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; AVX2-ONLY-NEXT: vmovaps 224(%r9), %xmm15 ; AVX2-ONLY-NEXT: vmovaps 224(%r8), %xmm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm0[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm1[0,1,2],xmm11[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm4[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm3[0,1,2],xmm12[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm5[0,1,2],xmm12[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm2[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1888(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 1856(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 1824(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 1792(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 1632(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 1600(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 1568(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 1536(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 1856(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 1824(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 1792(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 1632(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 1600(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 1568(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 1536(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1376(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6752,92 +6768,97 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-LABEL: store_i32_stride8_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $6152, %rsp # imm = 0x1808 +; AVX512F-NEXT: subq $6216, %rsp # imm = 0x1848 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-NEXT: vmovdqa64 (%r10), %zmm5 ; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm4 ; AVX512F-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rax), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm24 +; AVX512F-NEXT: vmovdqa64 (%rax), %zmm1 +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm0 +; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm30 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512F-NEXT: vpermt2d %zmm0, %zmm26, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm1, %zmm26, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm5, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm9, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm9, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm24, %zmm26, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm8, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 192(%r10), %zmm1 ; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm0 @@ -6845,196 +6866,209 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm26, %zmm1 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm25 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512F-NEXT: vmovdqa64 (%r9), %zmm0 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm24 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm19 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm22 ; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm19 -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm27 ; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm23, %zmm2 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm23, %zmm3 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm23, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm23, %zmm5 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm23, %zmm6 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm23, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm23, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm14 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm14 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm13 ; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm17 ; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm2 @@ -7045,228 +7079,222 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> -; AVX512F-NEXT: vpermt2d %zmm12, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> +; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm8, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm30 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm4, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm29 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm3, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm2, %zmm12 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm16 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm12 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm15 ; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm14, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm18 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm15 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm15 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm11 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm14 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm2 ; AVX512F-NEXT: movb $-120, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: movb $34, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} ; AVX512F-NEXT: movb $-52, %al ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm5 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm5 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm25 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm26 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm31 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm31 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm30 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm30 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm29 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm29 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm28 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm28 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm12 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm27 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm12 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm26 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm24 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm22 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm23 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} @@ -7283,24 +7311,30 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm19 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm18 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm16 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm16 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm15 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm15 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm14 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} @@ -7323,8 +7357,8 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} @@ -7338,33 +7372,33 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm2 {%k3} ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm1, 1984(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 1984(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm3, 1920(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 1792(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 1792(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm10, 1664(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm11, 1600(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm15, 1536(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 1472(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm14, 1536(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 1472(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 1408(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm18, 1344(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, 1280(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, 1216(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm22, 1152(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm26, 1088(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm27, 1024(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm19, 1280(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, 1216(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm21, 1152(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm23, 1088(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm24, 1024(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm12, 960(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm28, 896(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm29, 832(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm30, 768(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm31, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm26, 576(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm5, 448(%rax) @@ -7382,98 +7416,103 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-NEXT: addq $6152, %rsp # imm = 0x1808 +; AVX512F-NEXT: addq $6216, %rsp # imm = 0x1848 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i32_stride8_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $6152, %rsp # imm = 0x1808 +; AVX512BW-NEXT: subq $6216, %rsp # imm = 0x1848 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm4 ; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm24 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm0 +; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm30 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm26, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm26, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm8, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm1 ; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm0 @@ -7481,196 +7520,209 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm26, %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm25 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm22 ; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm19 -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm27 ; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm23, %zmm2 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm23, %zmm3 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm23, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm23, %zmm5 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm23, %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm23, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm23, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm13 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm13 ; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm17 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm2 @@ -7681,228 +7733,222 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm8, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm30 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm4, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm29 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm3, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm12 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm15 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm15 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm11 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm2 ; AVX512BW-NEXT: movb $-120, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: movb $34, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm5 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm5 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm26 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm31 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm31 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm30 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm30 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm29 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm29 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm28 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm28 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm12 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm27 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm12 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm26 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm24 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm22 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm23 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} @@ -7919,24 +7965,30 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm19 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} @@ -7959,8 +8011,8 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} @@ -7974,33 +8026,33 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 {%k3} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm1, 1984(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 1984(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm3, 1920(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 1792(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 1792(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 1664(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 1600(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 1536(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1472(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 1536(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 1472(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 1408(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 1344(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 1280(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 1216(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1088(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 1024(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 1280(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 1152(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 1088(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 1024(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm12, 960(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm28, 896(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm29, 832(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm30, 768(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm31, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 576(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rax) @@ -8018,7 +8070,7 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-NEXT: addq $6152, %rsp # imm = 0x1808 +; AVX512BW-NEXT: addq $6216, %rsp # imm = 0x1848 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll index c794b0fd83339..a9a213c8a5905 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll @@ -407,49 +407,49 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: subq $152, %rsp ; SSE-NEXT: movaps 112(%rdi), %xmm14 ; SSE-NEXT: movaps 96(%rdi), %xmm13 -; SSE-NEXT: movaps 80(%rdi), %xmm10 -; SSE-NEXT: movaps 64(%rdi), %xmm9 -; SSE-NEXT: movaps (%rdi), %xmm6 -; SSE-NEXT: movaps 16(%rdi), %xmm7 -; SSE-NEXT: movaps 32(%rdi), %xmm8 -; SSE-NEXT: movaps 48(%rdi), %xmm11 +; SSE-NEXT: movaps 80(%rdi), %xmm11 +; SSE-NEXT: movaps 64(%rdi), %xmm10 +; SSE-NEXT: movaps (%rdi), %xmm7 +; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps 32(%rdi), %xmm9 +; SSE-NEXT: movaps 48(%rdi), %xmm12 ; SSE-NEXT: movaps 96(%rsi), %xmm0 ; SSE-NEXT: movaps 80(%rsi), %xmm1 ; SSE-NEXT: movaps 64(%rsi), %xmm2 ; SSE-NEXT: movaps (%rsi), %xmm3 ; SSE-NEXT: movaps 16(%rsi), %xmm4 ; SSE-NEXT: movaps 32(%rsi), %xmm5 -; SSE-NEXT: movaps 48(%rsi), %xmm12 -; SSE-NEXT: movaps %xmm6, %xmm15 +; SSE-NEXT: movaps 48(%rsi), %xmm6 +; SSE-NEXT: movaps %xmm7, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm5[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm5[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm2[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -461,62 +461,62 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm14 -; SSE-NEXT: movaps 128(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1] -; SSE-NEXT: movaps 144(%rdi), %xmm10 -; SSE-NEXT: movaps 144(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm10, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] -; SSE-NEXT: movaps 160(%rdi), %xmm12 +; SSE-NEXT: movaps 128(%rdi), %xmm15 +; SSE-NEXT: movaps 128(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps 144(%rdi), %xmm13 +; SSE-NEXT: movaps 144(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 160(%rdi), %xmm10 ; SSE-NEXT: movaps 160(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm12, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movaps 176(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm10, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 176(%rdi), %xmm8 ; SSE-NEXT: movaps 176(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: movaps %xmm8, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 192(%rdi), %xmm7 -; SSE-NEXT: movaps 192(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] -; SSE-NEXT: movaps 208(%rdi), %xmm1 -; SSE-NEXT: movaps 208(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps 224(%rdi), %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 192(%rdi), %xmm6 +; SSE-NEXT: movaps 192(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 208(%rdi), %xmm5 +; SSE-NEXT: movaps 208(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSE-NEXT: movaps 224(%rdi), %xmm1 ; SSE-NEXT: movaps 224(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: movaps 240(%rdi), %xmm3 -; SSE-NEXT: movaps 240(%rsi), %xmm5 +; SSE-NEXT: movaps 240(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm3, 496(%rdx) ; SSE-NEXT: movaps %xmm0, 480(%rdx) -; SSE-NEXT: movaps %xmm2, 464(%rdx) -; SSE-NEXT: movaps %xmm4, 448(%rdx) -; SSE-NEXT: movaps %xmm1, 432(%rdx) -; SSE-NEXT: movaps %xmm6, 416(%rdx) -; SSE-NEXT: movaps %xmm7, 400(%rdx) -; SSE-NEXT: movaps %xmm8, 384(%rdx) -; SSE-NEXT: movaps %xmm9, 368(%rdx) +; SSE-NEXT: movaps %xmm1, 464(%rdx) +; SSE-NEXT: movaps %xmm2, 448(%rdx) +; SSE-NEXT: movaps %xmm5, 432(%rdx) +; SSE-NEXT: movaps %xmm7, 416(%rdx) +; SSE-NEXT: movaps %xmm6, 400(%rdx) +; SSE-NEXT: movaps %xmm9, 384(%rdx) +; SSE-NEXT: movaps %xmm8, 368(%rdx) ; SSE-NEXT: movaps %xmm11, 352(%rdx) -; SSE-NEXT: movaps %xmm12, 336(%rdx) -; SSE-NEXT: movaps %xmm13, 320(%rdx) -; SSE-NEXT: movaps %xmm10, 304(%rdx) -; SSE-NEXT: movaps %xmm15, 288(%rdx) -; SSE-NEXT: movaps %xmm14, 272(%rdx) +; SSE-NEXT: movaps %xmm10, 336(%rdx) +; SSE-NEXT: movaps %xmm12, 320(%rdx) +; SSE-NEXT: movaps %xmm13, 304(%rdx) +; SSE-NEXT: movaps %xmm14, 288(%rdx) +; SSE-NEXT: movaps %xmm15, 272(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -645,10 +645,10 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm10 @@ -672,11 +672,11 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm9[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm8[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm14[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm12[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] @@ -684,11 +684,11 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm8[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm9[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm5[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7] @@ -715,11 +715,11 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-ONLY-NEXT: vmovaps %ymm10, 416(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 320(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm13, 352(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 256(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 288(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm12, 192(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) @@ -831,178 +831,178 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm2 +; SSE-NEXT: movaps 128(%rdi), %xmm1 ; SSE-NEXT: movaps 128(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm2 -; SSE-NEXT: movaps 144(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 144(%rdi), %xmm1 +; SSE-NEXT: movaps 144(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm2 -; SSE-NEXT: movaps 160(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 160(%rdi), %xmm1 +; SSE-NEXT: movaps 160(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdi), %xmm2 -; SSE-NEXT: movaps 176(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 176(%rdi), %xmm1 +; SSE-NEXT: movaps 176(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdi), %xmm2 -; SSE-NEXT: movaps 192(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 192(%rdi), %xmm1 +; SSE-NEXT: movaps 192(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm2 -; SSE-NEXT: movaps 208(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 208(%rdi), %xmm1 +; SSE-NEXT: movaps 208(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm2 -; SSE-NEXT: movaps 224(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdi), %xmm1 +; SSE-NEXT: movaps 224(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm2 -; SSE-NEXT: movaps 240(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 240(%rdi), %xmm1 +; SSE-NEXT: movaps 240(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm2 -; SSE-NEXT: movaps 256(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 256(%rdi), %xmm1 +; SSE-NEXT: movaps 256(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 272(%rdi), %xmm2 -; SSE-NEXT: movaps 272(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 272(%rdi), %xmm1 +; SSE-NEXT: movaps 272(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm2 -; SSE-NEXT: movaps 288(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 288(%rdi), %xmm1 +; SSE-NEXT: movaps 288(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 304(%rdi), %xmm2 -; SSE-NEXT: movaps 304(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 304(%rdi), %xmm1 +; SSE-NEXT: movaps 304(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 320(%rdi), %xmm1 ; SSE-NEXT: movaps 320(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 336(%rdi), %xmm2 -; SSE-NEXT: movaps 336(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 336(%rdi), %xmm1 +; SSE-NEXT: movaps 336(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm2 -; SSE-NEXT: movaps 352(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 352(%rdi), %xmm1 +; SSE-NEXT: movaps 352(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 368(%rdi), %xmm15 ; SSE-NEXT: movaps 368(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 384(%rdi), %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps 384(%rdi), %xmm13 ; SSE-NEXT: movaps 384(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 400(%rdi), %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 400(%rdi), %xmm11 ; SSE-NEXT: movaps 400(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 416(%rdi), %xmm9 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 416(%rdi), %xmm12 ; SSE-NEXT: movaps 416(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 432(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 432(%rdi), %xmm8 ; SSE-NEXT: movaps 432(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 448(%rdi), %xmm5 -; SSE-NEXT: movaps 448(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 448(%rdi), %xmm6 +; SSE-NEXT: movaps 448(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 464(%rdi), %xmm5 +; SSE-NEXT: movaps 464(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSE-NEXT: movaps 464(%rdi), %xmm1 -; SSE-NEXT: movaps 464(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm1, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps 480(%rdi), %xmm2 +; SSE-NEXT: movaps 480(%rdi), %xmm1 ; SSE-NEXT: movaps 480(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: movaps 496(%rdi), %xmm3 -; SSE-NEXT: movaps 496(%rsi), %xmm6 +; SSE-NEXT: movaps 496(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm3, 1008(%rdx) ; SSE-NEXT: movaps %xmm0, 992(%rdx) -; SSE-NEXT: movaps %xmm2, 976(%rdx) -; SSE-NEXT: movaps %xmm4, 960(%rdx) -; SSE-NEXT: movaps %xmm1, 944(%rdx) +; SSE-NEXT: movaps %xmm1, 976(%rdx) +; SSE-NEXT: movaps %xmm2, 960(%rdx) +; SSE-NEXT: movaps %xmm5, 944(%rdx) ; SSE-NEXT: movaps %xmm7, 928(%rdx) -; SSE-NEXT: movaps %xmm5, 912(%rdx) -; SSE-NEXT: movaps %xmm8, 896(%rdx) -; SSE-NEXT: movaps %xmm10, 880(%rdx) -; SSE-NEXT: movaps %xmm11, 864(%rdx) -; SSE-NEXT: movaps %xmm9, 848(%rdx) -; SSE-NEXT: movaps %xmm12, 832(%rdx) -; SSE-NEXT: movaps %xmm14, 816(%rdx) +; SSE-NEXT: movaps %xmm6, 912(%rdx) +; SSE-NEXT: movaps %xmm9, 896(%rdx) +; SSE-NEXT: movaps %xmm8, 880(%rdx) +; SSE-NEXT: movaps %xmm10, 864(%rdx) +; SSE-NEXT: movaps %xmm12, 848(%rdx) +; SSE-NEXT: movaps %xmm14, 832(%rdx) +; SSE-NEXT: movaps %xmm11, 816(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 800(%rdx) -; SSE-NEXT: movaps %xmm15, 784(%rdx) +; SSE-NEXT: movaps %xmm13, 784(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 768(%rdx) -; SSE-NEXT: movaps %xmm13, 752(%rdx) +; SSE-NEXT: movaps %xmm15, 752(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 736(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1304,85 +1304,85 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-ONLY-LABEL: store_i64_stride2_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm12 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm8[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm6[0,1,1,3] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm13 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm10[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm7[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm10[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm7[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm10[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm8[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm6[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm11[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm11[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm12[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm3[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm12[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm9[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm9[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm2[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[0,2,2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm13[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 256(%rsi), %ymm1 @@ -1410,22 +1410,22 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm0 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm1[0,1,1,3] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm1[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm11 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm11[0,0,2,1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll index 6ae1465d3438e..780c3c55d3d3d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll @@ -340,79 +340,78 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i64_stride3_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movapd 64(%rdi), %xmm4 -; SSE-NEXT: movapd (%rdi), %xmm0 -; SSE-NEXT: movapd 16(%rdi), %xmm1 -; SSE-NEXT: movapd 32(%rdi), %xmm2 -; SSE-NEXT: movapd 48(%rdi), %xmm5 +; SSE-NEXT: movapd 64(%rdi), %xmm5 +; SSE-NEXT: movapd (%rdi), %xmm1 +; SSE-NEXT: movapd 16(%rdi), %xmm2 +; SSE-NEXT: movapd 32(%rdi), %xmm3 +; SSE-NEXT: movapd 48(%rdi), %xmm6 ; SSE-NEXT: movapd 64(%rsi), %xmm9 -; SSE-NEXT: movapd (%rsi), %xmm3 -; SSE-NEXT: movapd 16(%rsi), %xmm6 -; SSE-NEXT: movapd 32(%rsi), %xmm7 +; SSE-NEXT: movapd (%rsi), %xmm4 +; SSE-NEXT: movapd 16(%rsi), %xmm7 +; SSE-NEXT: movapd 32(%rsi), %xmm11 ; SSE-NEXT: movapd 48(%rsi), %xmm10 ; SSE-NEXT: movapd 64(%rdx), %xmm15 -; SSE-NEXT: movapd (%rdx), %xmm11 +; SSE-NEXT: movapd (%rdx), %xmm0 ; SSE-NEXT: movapd 16(%rdx), %xmm12 ; SSE-NEXT: movapd 32(%rdx), %xmm13 ; SSE-NEXT: movapd 48(%rdx), %xmm14 -; SSE-NEXT: movapd %xmm0, %xmm8 -; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm3[0] +; SSE-NEXT: movapd %xmm1, %xmm8 +; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm4[0] ; SSE-NEXT: movapd %xmm8, (%rsp) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm1, %xmm11 -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm6[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm2, %xmm12 -; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm7[0] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm13[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm5, %xmm13 +; SSE-NEXT: movapd %xmm3, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm11[0] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm13[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm13[1] +; SSE-NEXT: movapd %xmm6, %xmm13 ; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm10[0] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm14[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm14[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm14[1] -; SSE-NEXT: movapd %xmm4, %xmm14 +; SSE-NEXT: movapd %xmm5, %xmm14 ; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm9[0] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm15[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm15[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] ; SSE-NEXT: movapd 80(%rdi), %xmm15 -; SSE-NEXT: movapd 80(%rsi), %xmm6 +; SSE-NEXT: movapd 80(%rsi), %xmm7 ; SSE-NEXT: movapd %xmm15, %xmm8 -; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm6[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm7[0] ; SSE-NEXT: movapd 80(%rdx), %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movapd 96(%rdi), %xmm4 -; SSE-NEXT: movapd 96(%rsi), %xmm1 -; SSE-NEXT: movapd %xmm4, %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: movapd 96(%rsi), %xmm3 +; SSE-NEXT: movapd %xmm4, %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm3[0] ; SSE-NEXT: movapd 96(%rdx), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] ; SSE-NEXT: movapd 112(%rdi), %xmm2 ; SSE-NEXT: movapd 112(%rsi), %xmm0 -; SSE-NEXT: movapd %xmm2, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movapd 112(%rdx), %xmm5 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE-NEXT: movapd %xmm0, 368(%rcx) ; SSE-NEXT: movapd %xmm2, 352(%rcx) -; SSE-NEXT: movapd %xmm3, 336(%rcx) -; SSE-NEXT: movapd %xmm1, 320(%rcx) +; SSE-NEXT: movapd %xmm1, 336(%rcx) +; SSE-NEXT: movapd %xmm3, 320(%rcx) ; SSE-NEXT: movapd %xmm4, 304(%rcx) -; SSE-NEXT: movapd %xmm7, 288(%rcx) -; SSE-NEXT: movapd %xmm6, 272(%rcx) +; SSE-NEXT: movapd %xmm6, 288(%rcx) +; SSE-NEXT: movapd %xmm7, 272(%rcx) ; SSE-NEXT: movapd %xmm15, 256(%rcx) ; SSE-NEXT: movapd %xmm8, 240(%rcx) ; SSE-NEXT: movapd %xmm9, 224(%rcx) @@ -423,8 +422,7 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rcx) ; SSE-NEXT: movapd %xmm13, 144(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rcx) +; SSE-NEXT: movapd %xmm11, 128(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rcx) ; SSE-NEXT: movapd %xmm12, 96(%rcx) @@ -432,7 +430,8 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rcx) -; SSE-NEXT: movapd %xmm11, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -524,13 +523,13 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-LABEL: store_i64_stride3_vf16: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm11 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,1,2,1] @@ -539,37 +538,37 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm11[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm12 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm7[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm12[2,1,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm6[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm13[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5,6,7] @@ -585,13 +584,13 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 288(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 352(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 352(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 128(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 256(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%rcx) @@ -751,67 +750,67 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 160(%rdi), %xmm1 -; SSE-NEXT: movapd 160(%rsi), %xmm14 +; SSE-NEXT: movapd 160(%rsi), %xmm15 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm14[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm15[0] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 160(%rdx), %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movapd 176(%rdi), %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movapd 176(%rdi), %xmm13 ; SSE-NEXT: movapd 176(%rsi), %xmm12 -; SSE-NEXT: movapd %xmm15, %xmm0 +; SSE-NEXT: movapd %xmm13, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm12[0] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 176(%rdx), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movapd 192(%rdi), %xmm10 +; SSE-NEXT: movapd 192(%rdi), %xmm9 ; SSE-NEXT: movapd 192(%rsi), %xmm8 -; SSE-NEXT: movapd %xmm10, %xmm13 -; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm8[0] +; SSE-NEXT: movapd %xmm9, %xmm14 +; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm8[0] ; SSE-NEXT: movapd 192(%rdx), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movapd 208(%rdi), %xmm9 -; SSE-NEXT: movapd 208(%rsi), %xmm6 -; SSE-NEXT: movapd %xmm9, %xmm11 -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm6[0] +; SSE-NEXT: movapd 208(%rdi), %xmm10 +; SSE-NEXT: movapd 208(%rsi), %xmm7 +; SSE-NEXT: movapd %xmm10, %xmm11 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm7[0] ; SSE-NEXT: movapd 208(%rdx), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movapd 224(%rdi), %xmm4 -; SSE-NEXT: movapd 224(%rsi), %xmm1 -; SSE-NEXT: movapd %xmm4, %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: movapd 224(%rsi), %xmm3 +; SSE-NEXT: movapd %xmm4, %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm3[0] ; SSE-NEXT: movapd 224(%rdx), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] ; SSE-NEXT: movapd 240(%rdi), %xmm2 ; SSE-NEXT: movapd 240(%rsi), %xmm0 -; SSE-NEXT: movapd %xmm2, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movapd 240(%rdx), %xmm5 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE-NEXT: movapd %xmm0, 752(%rcx) ; SSE-NEXT: movapd %xmm2, 736(%rcx) -; SSE-NEXT: movapd %xmm3, 720(%rcx) -; SSE-NEXT: movapd %xmm1, 704(%rcx) +; SSE-NEXT: movapd %xmm1, 720(%rcx) +; SSE-NEXT: movapd %xmm3, 704(%rcx) ; SSE-NEXT: movapd %xmm4, 688(%rcx) -; SSE-NEXT: movapd %xmm7, 672(%rcx) -; SSE-NEXT: movapd %xmm6, 656(%rcx) -; SSE-NEXT: movapd %xmm9, 640(%rcx) +; SSE-NEXT: movapd %xmm6, 672(%rcx) +; SSE-NEXT: movapd %xmm7, 656(%rcx) +; SSE-NEXT: movapd %xmm10, 640(%rcx) ; SSE-NEXT: movapd %xmm11, 624(%rcx) ; SSE-NEXT: movapd %xmm8, 608(%rcx) -; SSE-NEXT: movapd %xmm10, 592(%rcx) -; SSE-NEXT: movapd %xmm13, 576(%rcx) +; SSE-NEXT: movapd %xmm9, 592(%rcx) +; SSE-NEXT: movapd %xmm14, 576(%rcx) ; SSE-NEXT: movapd %xmm12, 560(%rcx) -; SSE-NEXT: movapd %xmm15, 544(%rcx) +; SSE-NEXT: movapd %xmm13, 544(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 528(%rcx) -; SSE-NEXT: movapd %xmm14, 512(%rcx) +; SSE-NEXT: movapd %xmm15, 512(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 496(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -886,62 +885,62 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm5 ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm12 +; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm11 ; AVX1-ONLY-NEXT: vmovapd 32(%rdx), %ymm13 ; AVX1-ONLY-NEXT: vmovapd 64(%rdx), %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3] @@ -957,48 +956,48 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdx), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 96(%rdx), %ymm5 ; AVX1-ONLY-NEXT: vmovapd 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm9 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm9[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 128(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 128(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vmovapd 144(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm8[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 160(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 176(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm7[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd 176(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm6[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 192(%rdx), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 208(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm5[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 208(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm4[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2],ymm6[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm15[0],ymm8[1],ymm15[2],ymm8[3] ; AVX1-ONLY-NEXT: vmovapd 224(%rdx), %ymm15 ; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm3[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2],ymm14[3] +; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm2[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2],ymm14[3] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],mem[2],ymm14[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],mem[2],ymm14[3] @@ -1009,32 +1008,32 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2],ymm9[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2],ymm8[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2],ymm7[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2],ymm5[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2],ymm3[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd %ymm11, 736(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 704(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 640(%rcx) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2],ymm6[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],mem[2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd %ymm12, 736(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 704(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 640(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 608(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm1, 512(%rcx) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 416(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 416(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 320(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 320(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm10, 224(%rcx) @@ -1043,7 +1042,7 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd %ymm13, 128(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 672(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1067,96 +1066,96 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-LABEL: store_i64_stride3_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $168, %rsp -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm3 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm15 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm5[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm4[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[2,1,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[0,1,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,1,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 128(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm10[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm10[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm15[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] @@ -1166,21 +1165,21 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 192(%rdx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm4 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,2,1] @@ -1188,16 +1187,16 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastsd 224(%rdx), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm7 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm4[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm4, 736(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 704(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 704(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 672(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 640(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 608(%rcx) @@ -1205,12 +1204,12 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm3, 544(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm6, 512(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm15, 480(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 448(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 416(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 448(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 416(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 384(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 352(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 320(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 288(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 352(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 320(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 288(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1582,56 +1581,56 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movapd 432(%rdi), %xmm13 +; SSE-NEXT: movapd 432(%rdi), %xmm14 ; SSE-NEXT: movapd 432(%rsi), %xmm12 -; SSE-NEXT: movapd %xmm13, %xmm0 +; SSE-NEXT: movapd %xmm14, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm12[0] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 432(%rdx), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movapd 448(%rdi), %xmm10 +; SSE-NEXT: movapd 448(%rdi), %xmm9 ; SSE-NEXT: movapd 448(%rsi), %xmm8 -; SSE-NEXT: movapd %xmm10, %xmm14 -; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm8[0] +; SSE-NEXT: movapd %xmm9, %xmm13 +; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm8[0] ; SSE-NEXT: movapd 448(%rdx), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movapd 464(%rdi), %xmm9 -; SSE-NEXT: movapd 464(%rsi), %xmm6 -; SSE-NEXT: movapd %xmm9, %xmm11 -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm6[0] +; SSE-NEXT: movapd 464(%rdi), %xmm10 +; SSE-NEXT: movapd 464(%rsi), %xmm7 +; SSE-NEXT: movapd %xmm10, %xmm11 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm7[0] ; SSE-NEXT: movapd 464(%rdx), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movapd 480(%rdi), %xmm4 -; SSE-NEXT: movapd 480(%rsi), %xmm1 -; SSE-NEXT: movapd %xmm4, %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: movapd 480(%rsi), %xmm3 +; SSE-NEXT: movapd %xmm4, %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm3[0] ; SSE-NEXT: movapd 480(%rdx), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] ; SSE-NEXT: movapd 496(%rdi), %xmm2 ; SSE-NEXT: movapd 496(%rsi), %xmm0 -; SSE-NEXT: movapd %xmm2, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movapd 496(%rdx), %xmm5 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE-NEXT: movapd %xmm0, 1520(%rcx) ; SSE-NEXT: movapd %xmm2, 1504(%rcx) -; SSE-NEXT: movapd %xmm3, 1488(%rcx) -; SSE-NEXT: movapd %xmm1, 1472(%rcx) +; SSE-NEXT: movapd %xmm1, 1488(%rcx) +; SSE-NEXT: movapd %xmm3, 1472(%rcx) ; SSE-NEXT: movapd %xmm4, 1456(%rcx) -; SSE-NEXT: movapd %xmm7, 1440(%rcx) -; SSE-NEXT: movapd %xmm6, 1424(%rcx) -; SSE-NEXT: movapd %xmm9, 1408(%rcx) +; SSE-NEXT: movapd %xmm6, 1440(%rcx) +; SSE-NEXT: movapd %xmm7, 1424(%rcx) +; SSE-NEXT: movapd %xmm10, 1408(%rcx) ; SSE-NEXT: movapd %xmm11, 1392(%rcx) ; SSE-NEXT: movapd %xmm8, 1376(%rcx) -; SSE-NEXT: movapd %xmm10, 1360(%rcx) -; SSE-NEXT: movapd %xmm14, 1344(%rcx) +; SSE-NEXT: movapd %xmm9, 1360(%rcx) +; SSE-NEXT: movapd %xmm13, 1344(%rcx) ; SSE-NEXT: movapd %xmm12, 1328(%rcx) -; SSE-NEXT: movapd %xmm13, 1312(%rcx) +; SSE-NEXT: movapd %xmm14, 1312(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1296(%rcx) ; SSE-NEXT: movapd %xmm15, 1280(%rcx) @@ -1898,7 +1897,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd %ymm2, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm1 @@ -1947,101 +1946,107 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdx), %ymm14 +; AVX1-ONLY-NEXT: vmovapd 160(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 192(%rdx), %ymm12 +; AVX1-ONLY-NEXT: vmovapd 192(%rdx), %ymm14 ; AVX1-ONLY-NEXT: vmovapd 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3] ; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdx), %ymm11 +; AVX1-ONLY-NEXT: vmovapd 224(%rdx), %ymm13 ; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3] ; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 256(%rdx), %ymm9 +; AVX1-ONLY-NEXT: vmovapd 256(%rdx), %ymm11 ; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3] ; AVX1-ONLY-NEXT: vmovapd 256(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 288(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 288(%rdx), %ymm9 ; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3] ; AVX1-ONLY-NEXT: vmovapd 288(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 320(%rdx), %ymm6 +; AVX1-ONLY-NEXT: vmovapd 320(%rdx), %ymm7 ; AVX1-ONLY-NEXT: vmovapd 336(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovapd 320(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 352(%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 352(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovapd 368(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovapd 352(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 384(%rdx), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 384(%rdx), %ymm5 ; AVX1-ONLY-NEXT: vmovapd 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovapd 384(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 384(%rsi), %ymm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm12[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 416(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vmovapd 432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm13[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm10 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm10[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 448(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm10[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm8[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 480(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm4[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm1[2,3],ymm15[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd $2, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[1,0,2,2] @@ -2079,7 +2084,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2],ymm15[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2],ymm15[3] @@ -2091,30 +2096,25 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2],ymm15[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm15[0],ymm8[1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm15[0],ymm7[1],ymm15[2,3] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2],ymm15[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2,3] -; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2],ymm15[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm15[0],ymm5[1],ymm15[2,3] -; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2],ymm15[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2],ymm13[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm13[0],ymm3[1],ymm13[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],mem[2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2],ymm10[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm10[0],ymm2[1],ymm10[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2],ymm7[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 1504(%rcx) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm10[0],ymm3[1],ymm10[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2],ymm8[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],mem[2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 1504(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm1, 1472(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 1408(%rcx) @@ -2124,31 +2124,32 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd %ymm3, 1280(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 1216(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 1184(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 1184(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 1120(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 1088(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 1088(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 1024(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 992(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 992(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 928(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 896(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 896(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 832(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 800(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 800(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 736(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 704(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 704(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 640(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 608(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 608(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 544(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm14, 512(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 416(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 512(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2163,7 +2164,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1440(%rcx) @@ -2407,91 +2408,91 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 384(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 416(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm1[2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 448(%rdx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm14 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm1[4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm14[1],ymm1[3],ymm14[3] -; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 448(%rdx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm15[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm15 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm0[2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm14[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 480(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm12 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm12, 1504(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 1472(%rcx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 480(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm13 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm13, 1504(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 1472(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1440(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 1408(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 1376(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 1376(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 1344(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 1312(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 1280(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1248(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 1216(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 1184(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 1152(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 1120(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 1088(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 1056(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 1280(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 1248(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 1216(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 1184(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 1152(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 1120(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 1088(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 1056(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1024(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll index 3c33ef722a2f7..6db64bdc2adac 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll @@ -199,7 +199,7 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps 32(%rdi), %xmm3 ; SSE-NEXT: movaps 48(%rdi), %xmm0 ; SSE-NEXT: movaps (%rsi), %xmm10 -; SSE-NEXT: movaps 16(%rsi), %xmm12 +; SSE-NEXT: movaps 16(%rsi), %xmm14 ; SSE-NEXT: movaps 32(%rsi), %xmm11 ; SSE-NEXT: movaps (%rdx), %xmm2 ; SSE-NEXT: movaps 16(%rdx), %xmm4 @@ -207,8 +207,8 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps 48(%rdx), %xmm9 ; SSE-NEXT: movaps (%rcx), %xmm8 ; SSE-NEXT: movaps 16(%rcx), %xmm13 -; SSE-NEXT: movaps 32(%rcx), %xmm14 -; SSE-NEXT: movaps 48(%rcx), %xmm15 +; SSE-NEXT: movaps 32(%rcx), %xmm15 +; SSE-NEXT: movaps 48(%rcx), %xmm12 ; SSE-NEXT: movaps %xmm2, %xmm6 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -220,29 +220,29 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1] ; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; SSE-NEXT: movaps %xmm7, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] -; SSE-NEXT: movaps %xmm3, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm11[0] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm14[1] +; SSE-NEXT: movaps %xmm7, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1] +; SSE-NEXT: movaps %xmm3, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm11[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1] ; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] -; SSE-NEXT: movaps 48(%rsi), %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] +; SSE-NEXT: movaps 48(%rsi), %xmm12 ; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] ; SSE-NEXT: movaps %xmm0, 224(%r8) ; SSE-NEXT: movaps %xmm9, 240(%r8) ; SSE-NEXT: movaps %xmm6, 192(%r8) ; SSE-NEXT: movaps %xmm11, 208(%r8) ; SSE-NEXT: movaps %xmm3, 160(%r8) ; SSE-NEXT: movaps %xmm7, 176(%r8) -; SSE-NEXT: movaps %xmm14, 128(%r8) -; SSE-NEXT: movaps %xmm12, 144(%r8) +; SSE-NEXT: movaps %xmm15, 128(%r8) +; SSE-NEXT: movaps %xmm14, 144(%r8) ; SSE-NEXT: movaps %xmm1, 96(%r8) ; SSE-NEXT: movaps %xmm4, 112(%r8) ; SSE-NEXT: movaps %xmm13, 64(%r8) @@ -437,118 +437,118 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i64_stride4_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps (%rdi), %xmm6 -; SSE-NEXT: movaps 16(%rdi), %xmm7 -; SSE-NEXT: movaps 32(%rdi), %xmm8 -; SSE-NEXT: movaps 48(%rdi), %xmm9 -; SSE-NEXT: movaps (%rsi), %xmm1 +; SSE-NEXT: movaps (%rdi), %xmm7 +; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps 32(%rdi), %xmm9 +; SSE-NEXT: movaps 48(%rdi), %xmm10 +; SSE-NEXT: movaps (%rsi), %xmm3 ; SSE-NEXT: movaps 16(%rsi), %xmm2 -; SSE-NEXT: movaps 32(%rsi), %xmm0 -; SSE-NEXT: movaps 48(%rsi), %xmm15 -; SSE-NEXT: movaps (%rdx), %xmm10 -; SSE-NEXT: movaps 16(%rdx), %xmm11 -; SSE-NEXT: movaps 32(%rdx), %xmm13 -; SSE-NEXT: movaps 48(%rdx), %xmm14 -; SSE-NEXT: movaps (%rcx), %xmm3 -; SSE-NEXT: movaps 16(%rcx), %xmm4 -; SSE-NEXT: movaps 32(%rcx), %xmm5 -; SSE-NEXT: movaps %xmm6, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] +; SSE-NEXT: movaps 32(%rsi), %xmm1 +; SSE-NEXT: movaps 48(%rsi), %xmm0 +; SSE-NEXT: movaps (%rdx), %xmm11 +; SSE-NEXT: movaps 16(%rdx), %xmm12 +; SSE-NEXT: movaps 32(%rdx), %xmm14 +; SSE-NEXT: movaps 48(%rdx), %xmm15 +; SSE-NEXT: movaps (%rcx), %xmm4 +; SSE-NEXT: movaps 16(%rcx), %xmm5 +; SSE-NEXT: movaps 32(%rcx), %xmm6 +; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm3[0] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm5[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] +; SSE-NEXT: movaps %xmm12, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm14 +; SSE-NEXT: movaps 64(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm15 -; SSE-NEXT: movaps 64(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm1[1] -; SSE-NEXT: movaps 64(%rdx), %xmm11 -; SSE-NEXT: movaps 64(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] -; SSE-NEXT: movaps 80(%rdi), %xmm9 +; SSE-NEXT: movaps 64(%rdx), %xmm12 +; SSE-NEXT: movaps 64(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm12, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 80(%rdi), %xmm11 ; SSE-NEXT: movaps 80(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: movaps %xmm11, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 80(%rdx), %xmm10 -; SSE-NEXT: movaps 80(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps 96(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 80(%rdx), %xmm7 +; SSE-NEXT: movaps 80(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] +; SSE-NEXT: movaps 96(%rdi), %xmm8 +; SSE-NEXT: movaps 96(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: movaps 96(%rdx), %xmm5 ; SSE-NEXT: movaps 96(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm5, %xmm6 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps 112(%rdi), %xmm0 +; SSE-NEXT: movaps 112(%rdi), %xmm2 ; SSE-NEXT: movaps 112(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: movaps 112(%rdx), %xmm3 -; SSE-NEXT: movaps 112(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps 112(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, 496(%r8) -; SSE-NEXT: movaps %xmm0, 480(%r8) -; SSE-NEXT: movaps %xmm1, 464(%r8) -; SSE-NEXT: movaps %xmm2, 448(%r8) +; SSE-NEXT: movaps %xmm2, 480(%r8) +; SSE-NEXT: movaps %xmm0, 464(%r8) +; SSE-NEXT: movaps %xmm4, 448(%r8) ; SSE-NEXT: movaps %xmm5, 432(%r8) -; SSE-NEXT: movaps %xmm7, 416(%r8) +; SSE-NEXT: movaps %xmm8, 416(%r8) ; SSE-NEXT: movaps %xmm6, 400(%r8) -; SSE-NEXT: movaps %xmm8, 384(%r8) -; SSE-NEXT: movaps %xmm10, 368(%r8) -; SSE-NEXT: movaps %xmm9, 352(%r8) -; SSE-NEXT: movaps %xmm12, 336(%r8) +; SSE-NEXT: movaps %xmm9, 384(%r8) +; SSE-NEXT: movaps %xmm7, 368(%r8) +; SSE-NEXT: movaps %xmm11, 352(%r8) +; SSE-NEXT: movaps %xmm10, 336(%r8) ; SSE-NEXT: movaps %xmm13, 320(%r8) -; SSE-NEXT: movaps %xmm11, 304(%r8) -; SSE-NEXT: movaps %xmm15, 288(%r8) -; SSE-NEXT: movaps %xmm14, 272(%r8) +; SSE-NEXT: movaps %xmm12, 304(%r8) +; SSE-NEXT: movaps %xmm14, 288(%r8) +; SSE-NEXT: movaps %xmm15, 272(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -641,35 +641,35 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm10[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm14[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm12[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm13[1],xmm12[1] ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm14[0],xmm12[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm14[1],xmm12[1] -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm15[0],xmm13[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm15[1],xmm13[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm15[0],xmm14[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm15 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm14[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm15[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm15 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm14[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm15[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm15 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm14[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm15[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm15 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm14[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm15[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r8) @@ -678,10 +678,10 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm6, 416(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm5, 400(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm7, 384(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 176(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 176(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm13, 160(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm8, 144(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 128(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm11, 304(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm10, 288(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm9, 272(%r8) @@ -710,86 +710,86 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-LABEL: store_i64_stride4_vf16: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: pushq %rax -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm9 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm13 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm1[1],ymm10[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm3, %ymm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm12[0],ymm5[2],ymm12[2] +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm8, %ymm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm12, %ymm13 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm12, %ymm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm6, %ymm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm10, %ymm12 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm10, %ymm14 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm13[0],ymm11[2],ymm13[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm15[2,3],ymm14[2,3] ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm9[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm13 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm4 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm11[0],ymm4[2],ymm11[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm15[0],ymm9[2],ymm15[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm5 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[2],ymm13[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm11[1],ymm4[3],ymm11[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm9[1],ymm15[1],ymm9[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm13[1],ymm5[3],ymm13[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm15[1],ymm11[3],ymm15[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovaps %ymm4, 480(%r8) +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovaps %ymm3, 480(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 320(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 416(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 384(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 288(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 256(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 416(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 384(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 256(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -969,9 +969,9 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm2 @@ -1100,73 +1100,73 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdx), %xmm13 +; SSE-NEXT: movaps 176(%rdx), %xmm14 ; SSE-NEXT: movaps 176(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 192(%rdi), %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: movaps 192(%rdi), %xmm12 ; SSE-NEXT: movaps 192(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 192(%rdx), %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 192(%rdx), %xmm15 ; SSE-NEXT: movaps 192(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 208(%rdi), %xmm9 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps 208(%rdi), %xmm10 ; SSE-NEXT: movaps 208(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 208(%rdx), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 208(%rdx), %xmm7 ; SSE-NEXT: movaps 208(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm11 +; SSE-NEXT: movaps %xmm7, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 224(%rdi), %xmm5 -; SSE-NEXT: movaps 224(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSE-NEXT: movaps 224(%rdx), %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdi), %xmm8 +; SSE-NEXT: movaps 224(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdx), %xmm4 ; SSE-NEXT: movaps 224(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps 240(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps 240(%rdi), %xmm2 ; SSE-NEXT: movaps 240(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: movaps 240(%rdx), %xmm3 -; SSE-NEXT: movaps 240(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps 240(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, 1008(%r8) -; SSE-NEXT: movaps %xmm0, 992(%r8) -; SSE-NEXT: movaps %xmm1, 976(%r8) -; SSE-NEXT: movaps %xmm2, 960(%r8) -; SSE-NEXT: movaps %xmm6, 944(%r8) -; SSE-NEXT: movaps %xmm5, 928(%r8) -; SSE-NEXT: movaps %xmm7, 912(%r8) -; SSE-NEXT: movaps %xmm8, 896(%r8) -; SSE-NEXT: movaps %xmm10, 880(%r8) -; SSE-NEXT: movaps %xmm9, 864(%r8) +; SSE-NEXT: movaps %xmm2, 992(%r8) +; SSE-NEXT: movaps %xmm0, 976(%r8) +; SSE-NEXT: movaps %xmm5, 960(%r8) +; SSE-NEXT: movaps %xmm4, 944(%r8) +; SSE-NEXT: movaps %xmm8, 928(%r8) +; SSE-NEXT: movaps %xmm6, 912(%r8) +; SSE-NEXT: movaps %xmm9, 896(%r8) +; SSE-NEXT: movaps %xmm7, 880(%r8) +; SSE-NEXT: movaps %xmm10, 864(%r8) ; SSE-NEXT: movaps %xmm11, 848(%r8) -; SSE-NEXT: movaps %xmm12, 832(%r8) -; SSE-NEXT: movaps %xmm14, 816(%r8) -; SSE-NEXT: movaps %xmm15, 800(%r8) +; SSE-NEXT: movaps %xmm13, 832(%r8) +; SSE-NEXT: movaps %xmm15, 816(%r8) +; SSE-NEXT: movaps %xmm12, 800(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 784(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 768(%r8) -; SSE-NEXT: movaps %xmm13, 752(%r8) +; SSE-NEXT: movaps %xmm14, 752(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 736(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1416,10 +1416,10 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm1[0] @@ -1620,56 +1620,56 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm9[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm11[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm11[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm13[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm0[1],ymm11[3],ymm0[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm9 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 160(%rcx), %ymm13 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[2],ymm13[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm9[1],ymm3[3],ymm9[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm15 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm11[1],ymm3[3],ymm11[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3] ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm14 @@ -1682,15 +1682,15 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovaps %ymm11, 992(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 864(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 864(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 832(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 736(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 704(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 608(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 576(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 480(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 352(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 736(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 704(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 608(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 576(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 448(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 352(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 320(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) @@ -2368,62 +2368,62 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 448(%rdi), %xmm13 +; SSE-NEXT: movaps 448(%rdi), %xmm12 ; SSE-NEXT: movaps 448(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 448(%rdx), %xmm12 -; SSE-NEXT: movaps 448(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] -; SSE-NEXT: movaps 464(%rdi), %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 448(%rdx), %xmm11 +; SSE-NEXT: movaps 448(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 464(%rdi), %xmm13 ; SSE-NEXT: movaps 464(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm8, %xmm14 +; SSE-NEXT: movaps %xmm13, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movaps 464(%rdx), %xmm10 -; SSE-NEXT: movaps 464(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 480(%rdi), %xmm7 -; SSE-NEXT: movaps 480(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm7, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 464(%rdx), %xmm7 +; SSE-NEXT: movaps 464(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] +; SSE-NEXT: movaps 480(%rdi), %xmm8 +; SSE-NEXT: movaps 480(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: movaps 480(%rdx), %xmm5 ; SSE-NEXT: movaps 480(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm5, %xmm6 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps 496(%rdi), %xmm0 +; SSE-NEXT: movaps 496(%rdi), %xmm2 ; SSE-NEXT: movaps 496(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: movaps 496(%rdx), %xmm3 -; SSE-NEXT: movaps 496(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps 496(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, 2032(%r8) -; SSE-NEXT: movaps %xmm0, 2016(%r8) -; SSE-NEXT: movaps %xmm1, 2000(%r8) -; SSE-NEXT: movaps %xmm2, 1984(%r8) +; SSE-NEXT: movaps %xmm2, 2016(%r8) +; SSE-NEXT: movaps %xmm0, 2000(%r8) +; SSE-NEXT: movaps %xmm4, 1984(%r8) ; SSE-NEXT: movaps %xmm5, 1968(%r8) -; SSE-NEXT: movaps %xmm7, 1952(%r8) +; SSE-NEXT: movaps %xmm8, 1952(%r8) ; SSE-NEXT: movaps %xmm6, 1936(%r8) ; SSE-NEXT: movaps %xmm9, 1920(%r8) -; SSE-NEXT: movaps %xmm10, 1904(%r8) -; SSE-NEXT: movaps %xmm8, 1888(%r8) -; SSE-NEXT: movaps %xmm11, 1872(%r8) +; SSE-NEXT: movaps %xmm7, 1904(%r8) +; SSE-NEXT: movaps %xmm13, 1888(%r8) +; SSE-NEXT: movaps %xmm10, 1872(%r8) ; SSE-NEXT: movaps %xmm14, 1856(%r8) -; SSE-NEXT: movaps %xmm12, 1840(%r8) -; SSE-NEXT: movaps %xmm13, 1824(%r8) +; SSE-NEXT: movaps %xmm11, 1840(%r8) +; SSE-NEXT: movaps %xmm12, 1824(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1808(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2998,10 +2998,10 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps 480(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 480(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm1[0] @@ -3458,78 +3458,78 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 320(%rcx), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 320(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 352(%rcx), %ymm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm15[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm15[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 352(%rcx), %ymm0 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %ymm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm14[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm14[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 416(%rcx), %ymm15 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 416(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 448(%rcx), %ymm15 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm13[0],ymm3[2],ymm13[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm13[1],ymm3[3],ymm13[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 448(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm13[0],ymm12[2],ymm13[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm14 ; AVX2-ONLY-NEXT: vmovaps 480(%rcx), %ymm15 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm13[0],ymm12[2],ymm13[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm13[0],ymm12[2],ymm13[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vmovaps %ymm10, 2016(%r8) +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vmovaps %ymm11, 2016(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1984(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 1888(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 1888(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 1856(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 1760(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 1728(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1632(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 1600(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 1504(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 1472(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 1376(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 1344(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 1760(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 1728(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 1632(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 1600(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 1504(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 1472(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 1376(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 1344(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1248(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3640,583 +3640,591 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-LABEL: store_i64_stride4_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm5 -; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm6 -; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm7 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm11 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm10 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm8 -; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm16 -; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm2 -; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm4 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512F-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm6 +; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm7 +; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm8 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm14 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm23 +; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm3 +; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm5 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512F-NEXT: vpermt2q %zmm1, %zmm19, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm12, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm14, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm15, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512F-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm13, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm15, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm9, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm26, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm9, %zmm19, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm19, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm26, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm9, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm19, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm26, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm9, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm13, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm15, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm19, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm26, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm19, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm31 +; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm28 ; AVX512F-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm31, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm31, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm31, %zmm15 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm28, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm0, %zmm28, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm0, %zmm28, %zmm15 ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm31 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,8,u,u,1,9,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <2,10,u,u,3,11,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,u,u,1,9,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <2,10,u,u,3,11,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <4,12,u,u,5,13,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = <6,14,u,u,7,15,u,u> -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm19 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm27 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm9 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm7 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm10 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <6,14,u,u,7,15,u,u> +; AVX512F-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm22, %zmm20 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm22, %zmm18 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm7, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm22, %zmm19 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm22, %zmm4 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqa64 320(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm11 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm9 ; AVX512F-NEXT: vmovdqa64 384(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm8 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm9 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 ; AVX512F-NEXT: movb $-52, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm0, 1984(%r8) +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, 1984(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm2, 1920(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm3, 1856(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 1792(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm8, 1728(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, 1664(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm6, 1600(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm12, 1536(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm11, 1472(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm13, 1408(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm14, 1344(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm15, 1280(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm10, 1216(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm16, 1152(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, 1088(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm18, 1024(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm7, 960(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm20, 896(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm5, 1856(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm7, 1792(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm9, 1728(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm3, 1664(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm8, 1600(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm10, 1536(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm6, 1472(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm11, 1408(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm12, 1344(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm13, 1280(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, 1216(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm14, 1152(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm15, 1088(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm16, 1024(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm19, 960(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm17, 896(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm21, 832(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm22, 768(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm9, 704(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm23, 768(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm18, 704(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm24, 640(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm26, 512(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm28, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm29, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm30, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm19, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm23, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, 512(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm20, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm29, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm30, 320(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512F-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm31, 128(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%r8) -; AVX512F-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX512F-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm22, (%r8) +; AVX512F-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride4_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm10 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm8 -; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm16 -; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512BW-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm14 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm23 +; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm26, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm31 +; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm28 ; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm31, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm31, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm31, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm28, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm28, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm28, %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm31 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,8,u,u,1,9,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <2,10,u,u,3,11,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,u,u,1,9,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <2,10,u,u,3,11,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <4,12,u,u,5,13,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <6,14,u,u,7,15,u,u> -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm19 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm27 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm9 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm7 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm10 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <6,14,u,u,7,15,u,u> +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm20 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm18 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm22, %zmm19 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm11 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm8 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm9 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 1984(%r8) +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1984(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 1920(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 1856(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 1792(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 1728(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 1664(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 1600(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 1536(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 1472(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 1408(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 1344(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 1280(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 1216(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1152(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1024(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 960(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 896(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 1856(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 1792(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 1728(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 1664(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 1600(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 1536(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 1472(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 1408(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 1344(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 1280(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 1216(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 1152(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 1088(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 960(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 896(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm21, 832(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 768(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 704(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 768(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 704(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm24, 640(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 512(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 512(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 320(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512BW-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 128(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%r8) -; AVX512BW-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX512BW-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm22, (%r8) +; AVX512BW-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll index 9f7c408e2a6bf..c0e1a17243a7e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll @@ -109,38 +109,38 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i64_stride5_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm1 +; SSE-NEXT: movaps (%rdi), %xmm2 ; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm2 -; SSE-NEXT: movaps 16(%rsi), %xmm3 -; SSE-NEXT: movaps (%rdx), %xmm4 -; SSE-NEXT: movaps 16(%rdx), %xmm5 -; SSE-NEXT: movaps (%rcx), %xmm6 -; SSE-NEXT: movaps 16(%rcx), %xmm7 -; SSE-NEXT: movaps (%r8), %xmm8 -; SSE-NEXT: movaps 16(%r8), %xmm9 -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] +; SSE-NEXT: movaps (%rsi), %xmm4 +; SSE-NEXT: movaps 16(%rsi), %xmm6 +; SSE-NEXT: movaps (%rdx), %xmm5 +; SSE-NEXT: movaps 16(%rdx), %xmm1 +; SSE-NEXT: movaps (%rcx), %xmm7 +; SSE-NEXT: movaps 16(%rcx), %xmm8 +; SSE-NEXT: movaps (%r8), %xmm9 +; SSE-NEXT: movaps 16(%r8), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm1, (%r9) -; SSE-NEXT: movaps %xmm4, 16(%r9) -; SSE-NEXT: movaps %xmm8, 32(%r9) -; SSE-NEXT: movaps %xmm3, 48(%r9) -; SSE-NEXT: movaps %xmm6, 64(%r9) +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-NEXT: movaps %xmm2, (%r9) +; SSE-NEXT: movaps %xmm5, 16(%r9) +; SSE-NEXT: movaps %xmm9, 32(%r9) +; SSE-NEXT: movaps %xmm6, 48(%r9) +; SSE-NEXT: movaps %xmm7, 64(%r9) ; SSE-NEXT: movaps %xmm0, 80(%r9) -; SSE-NEXT: movaps %xmm5, 96(%r9) -; SSE-NEXT: movaps %xmm9, 112(%r9) +; SSE-NEXT: movaps %xmm1, 96(%r9) +; SSE-NEXT: movaps %xmm3, 112(%r9) ; SSE-NEXT: movaps %xmm10, 128(%r9) -; SSE-NEXT: movaps %xmm7, 144(%r9) +; SSE-NEXT: movaps %xmm8, 144(%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf4: @@ -679,51 +679,51 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movapd 80(%rdi), %xmm14 -; SSE-NEXT: movapd 80(%rsi), %xmm12 +; SSE-NEXT: movapd 80(%rsi), %xmm13 ; SSE-NEXT: movapd %xmm14, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm12[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm13[0] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 80(%r8), %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] ; SSE-NEXT: movapd 80(%rdx), %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] ; SSE-NEXT: movapd 80(%rcx), %xmm9 ; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movapd 96(%rdi), %xmm11 -; SSE-NEXT: movapd 96(%rsi), %xmm7 -; SSE-NEXT: movapd %xmm11, %xmm13 -; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm7[0] -; SSE-NEXT: movapd 96(%r8), %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1] +; SSE-NEXT: movapd 96(%rsi), %xmm8 +; SSE-NEXT: movapd %xmm11, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm8[0] +; SSE-NEXT: movapd 96(%r8), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] ; SSE-NEXT: movapd 96(%rdx), %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] -; SSE-NEXT: movapd 96(%rcx), %xmm4 -; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movapd 112(%rdi), %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm6[1] +; SSE-NEXT: movapd 96(%rcx), %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: movapd 112(%rdi), %xmm4 ; SSE-NEXT: movapd 112(%rsi), %xmm2 -; SSE-NEXT: movapd %xmm3, %xmm5 +; SSE-NEXT: movapd %xmm4, %xmm5 ; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movapd 112(%r8), %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm8[0],xmm3[1] +; SSE-NEXT: movapd 112(%r8), %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] ; SSE-NEXT: movapd 112(%rdx), %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movapd 112(%rcx), %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] ; SSE-NEXT: movapd %xmm0, 624(%r9) ; SSE-NEXT: movapd %xmm2, 608(%r9) -; SSE-NEXT: movapd %xmm3, 592(%r9) +; SSE-NEXT: movapd %xmm4, 592(%r9) ; SSE-NEXT: movapd %xmm1, 576(%r9) ; SSE-NEXT: movapd %xmm5, 560(%r9) -; SSE-NEXT: movapd %xmm4, 544(%r9) -; SSE-NEXT: movapd %xmm7, 528(%r9) +; SSE-NEXT: movapd %xmm3, 544(%r9) +; SSE-NEXT: movapd %xmm8, 528(%r9) ; SSE-NEXT: movapd %xmm11, 512(%r9) ; SSE-NEXT: movapd %xmm6, 496(%r9) -; SSE-NEXT: movapd %xmm13, 480(%r9) +; SSE-NEXT: movapd %xmm12, 480(%r9) ; SSE-NEXT: movapd %xmm9, 464(%r9) -; SSE-NEXT: movapd %xmm12, 448(%r9) +; SSE-NEXT: movapd %xmm13, 448(%r9) ; SSE-NEXT: movapd %xmm14, 432(%r9) ; SSE-NEXT: movapd %xmm10, 416(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -783,153 +783,149 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-LABEL: store_i64_stride5_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $216, %rsp -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],mem[0],ymm2[2],mem[2] ; AVX1-ONLY-NEXT: vmovapd 96(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm0[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm1[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm15[0],mem[0],ymm15[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm10[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm9[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm9[2,3],ymm13[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm13[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm0[0],ymm14[1,2,3] +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm6[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm8[0],ymm13[1,2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm8[0],ymm0[1],ymm8[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm3[0,1],ymm0[2],ymm3[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm15[2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0],ymm9[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm9[0],ymm2[1,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm9[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm15[0],ymm8[1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm5[0,1],ymm8[2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm7[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm9 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm0 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0],ymm3[1,2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm0[2],ymm9[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm14[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm13, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm15, (%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 480(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 480(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm12, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 336(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 320(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 576(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 336(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 320(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 576(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm3, 512(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 384(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 352(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 256(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm14, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 352(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 256(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 608(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 608(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm4, 544(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -943,159 +939,158 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-LABEL: store_i64_stride5_vf16: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm9 ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm11, %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm8[0],ymm11[2],ymm8[2] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm13 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm12, %ymm12 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],mem[0],ymm15[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rsi), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm8[1],ymm1[1],ymm8[3],ymm1[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm4 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm4 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm4 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm4[2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm14 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm14[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm4, 576(%r9) +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1],ymm5[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm0 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm0[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm3 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm3[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, (%rsp), %ymm3, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm15 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm15[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm15[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm3, 576(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 512(%r9) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 384(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 352(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 384(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 256(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 224(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 256(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 64(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 32(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 448(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 608(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 288(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 608(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 288(%r9) ; AVX2-ONLY-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1524,51 +1519,51 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movapd 208(%rdi), %xmm14 -; SSE-NEXT: movapd 208(%rsi), %xmm12 +; SSE-NEXT: movapd 208(%rsi), %xmm13 ; SSE-NEXT: movapd %xmm14, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm12[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm13[0] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 208(%r8), %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] ; SSE-NEXT: movapd 208(%rdx), %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] ; SSE-NEXT: movapd 208(%rcx), %xmm9 ; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movapd 224(%rdi), %xmm11 -; SSE-NEXT: movapd 224(%rsi), %xmm7 -; SSE-NEXT: movapd %xmm11, %xmm13 -; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm7[0] -; SSE-NEXT: movapd 224(%r8), %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1] +; SSE-NEXT: movapd 224(%rsi), %xmm8 +; SSE-NEXT: movapd %xmm11, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm8[0] +; SSE-NEXT: movapd 224(%r8), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] ; SSE-NEXT: movapd 224(%rdx), %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] -; SSE-NEXT: movapd 224(%rcx), %xmm4 -; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movapd 240(%rdi), %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm6[1] +; SSE-NEXT: movapd 224(%rcx), %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: movapd 240(%rdi), %xmm4 ; SSE-NEXT: movapd 240(%rsi), %xmm2 -; SSE-NEXT: movapd %xmm3, %xmm5 +; SSE-NEXT: movapd %xmm4, %xmm5 ; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movapd 240(%r8), %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm8[0],xmm3[1] +; SSE-NEXT: movapd 240(%r8), %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] ; SSE-NEXT: movapd 240(%rdx), %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movapd 240(%rcx), %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] ; SSE-NEXT: movapd %xmm0, 1264(%r9) ; SSE-NEXT: movapd %xmm2, 1248(%r9) -; SSE-NEXT: movapd %xmm3, 1232(%r9) +; SSE-NEXT: movapd %xmm4, 1232(%r9) ; SSE-NEXT: movapd %xmm1, 1216(%r9) ; SSE-NEXT: movapd %xmm5, 1200(%r9) -; SSE-NEXT: movapd %xmm4, 1184(%r9) -; SSE-NEXT: movapd %xmm7, 1168(%r9) +; SSE-NEXT: movapd %xmm3, 1184(%r9) +; SSE-NEXT: movapd %xmm8, 1168(%r9) ; SSE-NEXT: movapd %xmm11, 1152(%r9) ; SSE-NEXT: movapd %xmm6, 1136(%r9) -; SSE-NEXT: movapd %xmm13, 1120(%r9) +; SSE-NEXT: movapd %xmm12, 1120(%r9) ; SSE-NEXT: movapd %xmm9, 1104(%r9) -; SSE-NEXT: movapd %xmm12, 1088(%r9) +; SSE-NEXT: movapd %xmm13, 1088(%r9) ; SSE-NEXT: movapd %xmm14, 1072(%r9) ; SSE-NEXT: movapd %xmm10, 1056(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1707,17 +1702,17 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 +; AVX1-ONLY-NEXT: subq $1048, %rsp # imm = 0x418 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] @@ -1726,26 +1721,25 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX1-ONLY-NEXT: vmovapd 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],mem[0],ymm7[2],mem[2] ; AVX1-ONLY-NEXT: vmovapd 160(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],mem[0],ymm9[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -1758,49 +1752,49 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 136(%rsi), %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1811,205 +1805,205 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm0[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm14[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm14[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm13[0],ymm9[0],ymm13[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm14[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0],ymm14[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm13[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm0[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm6[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm15[0,1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm9[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm15[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm15[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[0],ymm2[0],ymm8[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm15[1],xmm8[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm15[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0],ymm15[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm15[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, (%rsp), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm9[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $252, (%rsp), %ymm8, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm8[0,1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm8[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm9[0],ymm2[0],ymm9[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm13[1],xmm9[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0],ymm13[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm13[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm13[2],ymm2[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm5[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0],ymm8[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm8[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm2[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm2[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm2[2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm11[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 176(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0],ymm14[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0],ymm12[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm14[2],ymm11[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 176(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0],ymm5[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm5[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm5[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0],ymm8[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0],ymm11[1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 240(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0],ymm11[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm5[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm5, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 976(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 960(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 1136(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm10, (%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 976(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 960(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1136(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm9, 1120(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 816(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 800(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 496(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 480(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm1, 336(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 816(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 800(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 496(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 480(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 336(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm14, 320(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 656(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 640(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 656(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 640(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2030,9 +2024,9 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%r9) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%r9) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%r9) @@ -2074,69 +2068,69 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX1-ONLY-NEXT: addq $1048, %rsp # imm = 0x418 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride5_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $1128, %rsp # imm = 0x468 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm2, %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm2, %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm14[0],mem[0],ymm14[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],mem[0],ymm13[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],mem[0],ymm7[2],mem[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 @@ -2145,11 +2139,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],mem[0],ymm13[2],mem[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 @@ -2209,188 +2203,186 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rcx), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],mem[0],ymm11[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 184(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 184(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 248(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm5 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm7[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm13[1],ymm4[1],ymm13[3],ymm4[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1],ymm9[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm6[1],ymm2[1],ymm6[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 152(%rsi), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm9[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm7 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm5 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm5[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm5[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm5 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm5[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm5[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 152(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1],ymm4[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%rsi), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm10[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] -; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm0 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, (%rsp), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm0 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm0[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm2 ; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm2[0,1],mem[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm0 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm0[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm7[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm10 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm10[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm10[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1],ymm11[2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm3 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm3[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm2[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm4 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm4[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm4[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm11[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1],ymm9[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm0 ; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 1184(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 1152(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1120(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 1056(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 1056(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 1024(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 992(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 992(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 896(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 864(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 832(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 896(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 864(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 832(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 800(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 736(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 736(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm12, 704(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm13, 672(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 640(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 576(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 576(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2407,7 +2399,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r9) @@ -2421,14 +2413,14 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 1088(%r9) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm14, 1088(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 768(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 1248(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 1248(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 928(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2442,201 +2434,201 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-LABEL: store_i64_stride5_vf32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm3 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm16 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm17 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm24 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm22 ; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm27 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <3,u,u,u,12,4,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm18, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm18, %zmm29 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm26, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,8,u,u,u,1,9,u> -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [15,7,15,7,15,7,15,7] -; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm22, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <6,14,u,u,u,7,15,u> -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm15, %zmm25 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [13,5,13,5,13,5,13,5] -; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm19, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = <1,u,u,u,10,2,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm28, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm23, %zmm26, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm15, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm19, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm28, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm26, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm28, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm26, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512F-NEXT: vpermt2q %zmm17, %zmm15, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm19, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm26, %zmm17 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm23 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm25 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm18 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <3,u,u,u,12,4,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm13, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm22, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm26, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm13, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm20, %zmm16 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm20, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm20, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm22, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm22 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm19, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,8,u,u,u,1,9,u> +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] +; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <6,14,u,u,u,7,15,u> +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] +; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = <1,u,u,u,10,2,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm18, %zmm28, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm12, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm21, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm25, %zmm28, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512F-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm28, %zmm23 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm17, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm11, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm15 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm28, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 +; AVX512F-NEXT: vpermi2q %zmm31, %zmm1, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm31, %zmm29 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm31, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm31 ; AVX512F-NEXT: movb $49, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} ; AVX512F-NEXT: movb $-116, %al ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm21 {%k3} -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k3} -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm20 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm15 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 {%k3} +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm0 ; AVX512F-NEXT: movb $8, %al ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,14,3,4,5,6,15] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm25 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [12,1,2,3,4,13,6,7] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm17 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,8,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm9 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm14 {%k3} +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm12 {%k2} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm28 {%k2} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm28 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 {%k3} -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm24 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm4 {%k2} -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm13, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm29 {%k3} -; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm7, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm26 {%k2} -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm12, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm29, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm4, 512(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm24, 576(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm16, 640(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm10, 704(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm20, 768(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm11, 832(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm23, 896(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm21, 960(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm28, 1024(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm27, 1088(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm17, 1152(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm25, 1216(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm31, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm19, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm2, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm4, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm14, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm5, 512(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm15, 640(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm8, 704(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm10, 768(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm16, 832(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm18, 896(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm20, 960(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 1024(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 1088(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 1152(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm26, 1216(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512F-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2644,201 +2636,201 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-LABEL: store_i64_stride5_vf32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm24 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm22 ; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm27 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <3,u,u,u,12,4,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm18, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm29 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,8,u,u,u,1,9,u> -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <6,14,u,u,u,7,15,u> -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm25 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [13,5,13,5,13,5,13,5] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm19, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <1,u,u,u,10,2,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm28, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm26, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm15, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm19, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm28, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm26, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm26, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm26, %zmm17 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm23 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm25 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <3,u,u,u,12,4,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm13, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm26, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm13, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm16 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,8,u,u,u,1,9,u> +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] +; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <6,14,u,u,u,7,15,u> +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <1,u,u,u,10,2,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm12, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm28, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm28, %zmm23 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm28, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm1, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm31, %zmm29 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm31, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm31 ; AVX512BW-NEXT: movb $49, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} ; AVX512BW-NEXT: movb $-116, %al ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm21 {%k3} -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k3} -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 {%k3} +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm0 ; AVX512BW-NEXT: movb $8, %al ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,14,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm25 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [12,1,2,3,4,13,6,7] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,8,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm12 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm28 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm28 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm29 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm26 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 576(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 640(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 704(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 768(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 832(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 896(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 960(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 1024(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 1088(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1152(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 1216(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 512(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 640(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 704(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 768(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 832(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 896(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 960(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 1024(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 1088(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 1152(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 1216(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512BW-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3325,51 +3317,51 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movapd 464(%rdi), %xmm14 -; SSE-NEXT: movapd 464(%rsi), %xmm12 +; SSE-NEXT: movapd 464(%rsi), %xmm13 ; SSE-NEXT: movapd %xmm14, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm12[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm13[0] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 464(%r8), %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] ; SSE-NEXT: movapd 464(%rdx), %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] ; SSE-NEXT: movapd 464(%rcx), %xmm9 ; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movapd 480(%rdi), %xmm11 -; SSE-NEXT: movapd 480(%rsi), %xmm7 -; SSE-NEXT: movapd %xmm11, %xmm13 -; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm7[0] -; SSE-NEXT: movapd 480(%r8), %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1] +; SSE-NEXT: movapd 480(%rsi), %xmm8 +; SSE-NEXT: movapd %xmm11, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm8[0] +; SSE-NEXT: movapd 480(%r8), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] ; SSE-NEXT: movapd 480(%rdx), %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] -; SSE-NEXT: movapd 480(%rcx), %xmm4 -; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movapd 496(%rdi), %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm6[1] +; SSE-NEXT: movapd 480(%rcx), %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: movapd 496(%rdi), %xmm4 ; SSE-NEXT: movapd 496(%rsi), %xmm2 -; SSE-NEXT: movapd %xmm3, %xmm5 +; SSE-NEXT: movapd %xmm4, %xmm5 ; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movapd 496(%r8), %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm8[0],xmm3[1] +; SSE-NEXT: movapd 496(%r8), %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] ; SSE-NEXT: movapd 496(%rdx), %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movapd 496(%rcx), %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] ; SSE-NEXT: movapd %xmm0, 2544(%r9) ; SSE-NEXT: movapd %xmm2, 2528(%r9) -; SSE-NEXT: movapd %xmm3, 2512(%r9) +; SSE-NEXT: movapd %xmm4, 2512(%r9) ; SSE-NEXT: movapd %xmm1, 2496(%r9) ; SSE-NEXT: movapd %xmm5, 2480(%r9) -; SSE-NEXT: movapd %xmm4, 2464(%r9) -; SSE-NEXT: movapd %xmm7, 2448(%r9) +; SSE-NEXT: movapd %xmm3, 2464(%r9) +; SSE-NEXT: movapd %xmm8, 2448(%r9) ; SSE-NEXT: movapd %xmm11, 2432(%r9) ; SSE-NEXT: movapd %xmm6, 2416(%r9) -; SSE-NEXT: movapd %xmm13, 2400(%r9) +; SSE-NEXT: movapd %xmm12, 2400(%r9) ; SSE-NEXT: movapd %xmm9, 2384(%r9) -; SSE-NEXT: movapd %xmm12, 2368(%r9) +; SSE-NEXT: movapd %xmm13, 2368(%r9) ; SSE-NEXT: movapd %xmm14, 2352(%r9) ; SSE-NEXT: movapd %xmm10, 2336(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3668,17 +3660,17 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2280, %rsp # imm = 0x8E8 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 +; AVX1-ONLY-NEXT: subq $2264, %rsp # imm = 0x8D8 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm2 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] @@ -3687,11 +3679,10 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3701,16 +3692,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],mem[0],ymm14[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -3730,18 +3721,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],mem[0],ymm14[2],mem[2] +; AVX1-ONLY-NEXT: vmovapd 288(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm10 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3757,7 +3746,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm11 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3780,50 +3769,50 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm15 +; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm13 ; AVX1-ONLY-NEXT: vmovapd 480(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -3842,52 +3831,52 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vbroadcastsd 264(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 296(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 328(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -3906,106 +3895,107 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 392(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 424(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 456(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 424(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 456(%rsi), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm10 ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm8[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 488(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovapd %ymm15, %ymm11 -; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm15[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm15[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm9[0],ymm7[0],ymm9[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm15[1],xmm9[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm15[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0],ymm15[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm9[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 488(%rsi), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm13[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vmovapd %ymm13, %ymm9 +; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm12[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm12[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm12[1],xmm8[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm12[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0],ymm12[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm7[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm7[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm7[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1],ymm7[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm7 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0],ymm9[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm9[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = mem[0],ymm8[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm8[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] @@ -4031,12 +4021,12 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm7 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0],ymm7[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm7[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0],ymm7[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm7[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm7[3] @@ -4057,9 +4047,9 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[3] ; AVX1-ONLY-NEXT: vmovapd 240(%rsi), %xmm6 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm6[1],xmm5[1] ; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm6 @@ -4071,13 +4061,13 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = mem[0],ymm6[1],mem[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 256(%r8), %ymm4 ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm4[6,7] @@ -4090,8 +4080,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm14[2,3] ; AVX1-ONLY-NEXT: vmovapd 304(%rdx), %xmm4 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[3] ; AVX1-ONLY-NEXT: vmovapd 304(%rsi), %xmm5 @@ -4159,9 +4148,9 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd 432(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm3 @@ -4173,24 +4162,25 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = mem[0],ymm3[1],mem[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 448(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm1[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm11[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm9[2,3] ; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd 496(%rsi), %xmm2 @@ -4201,16 +4191,17 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm2[1],mem[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0],ymm10[1,2,3] +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm2[0],mem[1,2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] @@ -4227,105 +4218,104 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm10[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm14[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm12, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm14, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 1936(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 1920(%r9) +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm9, (%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 1936(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 1920(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm0, 2256(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 2240(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 2240(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 2416(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 2400(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 2400(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm1, 2096(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 2080(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 2080(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1616(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 1600(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 1600(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm2, 1776(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 1760(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 1760(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1456(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 1440(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 1440(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 976(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 960(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 1136(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 1120(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 960(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 1136(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 1120(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 816(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 800(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%r9) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r9) @@ -4469,13 +4459,13 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: addq $2280, %rsp # imm = 0x8E8 +; AVX1-ONLY-NEXT: addq $2264, %rsp # imm = 0x8D8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride5_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $2760, %rsp # imm = 0xAC8 +; AVX2-ONLY-NEXT: subq $2696, %rsp # imm = 0xA88 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm12 @@ -4773,7 +4763,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 184(%rsi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm1 @@ -4811,71 +4801,71 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm2[1],ymm13[3],ymm2[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm15[1],ymm3[3],ymm15[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd 152(%rsi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 152(%rsi), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] ; AVX2-ONLY-NEXT: vbroadcastsd 216(%rsi), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 256(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm13[1],ymm6[1],ymm13[3],ymm6[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 280(%rsi), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 280(%rsi), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm8[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 320(%rcx), %ymm7 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 344(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm10[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 320(%rcx), %ymm8 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 344(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %ymm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 408(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 408(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm14[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm11 -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm12 +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 448(%rcx), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm11[1],ymm0[1],ymm11[3],ymm0[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 472(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm15[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 472(%rsi), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4883,18 +4873,18 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm2[0],ymm13[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 @@ -4904,11 +4894,11 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm1 @@ -4920,12 +4910,12 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps $63, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm15[0],ymm3[2],ymm15[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm1 @@ -4956,7 +4946,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 @@ -4975,130 +4965,130 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm2 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm2[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm6[0],ymm13[2],ymm6[2] -; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm0[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vmovaps 256(%r8), %ymm3 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm3[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vmovaps 256(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 304(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 288(%r8), %ymm4 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm4 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 304(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 288(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 368(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm15 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm15[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm15[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 368(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] +; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm11 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = ymm11[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1],ymm11[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm11[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm11[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 432(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 416(%r8), %ymm11 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = ymm11[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[0,1],ymm11[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 432(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 416(%r8), %ymm12 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm12[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1],ymm12[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm11[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm0 ; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = ymm0[0,1],mem[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm0 ; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2496(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 2464(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 2432(%r9) @@ -5115,13 +5105,12 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2080(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 2016(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 1984(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 1952(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 1984(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 1952(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1920(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 1856(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1824(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 1856(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 1824(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1792(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5182,7 +5171,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 672(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 640(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%r9) @@ -5190,7 +5179,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 512(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r9) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%r9) @@ -5214,7 +5203,8 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 2368(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2368(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2048(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5229,7 +5219,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 2528(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 2528(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2208(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5244,447 +5234,444 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 608(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r9) -; AVX2-ONLY-NEXT: addq $2760, %rsp # imm = 0xAC8 +; AVX2-ONLY-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: store_i64_stride5_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3080, %rsp # imm = 0xC08 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 +; AVX512F-NEXT: subq $3144, %rsp # imm = 0xC48 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm1 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <3,u,u,u,12,4,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm11, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm11, %zmm12 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <3,u,u,u,12,4,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm20, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,8,u,u,u,1,9,u> -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm18, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,8,u,u,u,1,9,u> +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm21, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm13, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm18, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm29, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm18, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm29, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm6 +; AVX512F-NEXT: vmovdqa64 320(%rsi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm20, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm20 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm21 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm25 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <1,u,u,u,10,2,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <6,14,u,u,u,7,15,u> +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm26 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm26 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 384(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm19 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 +; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm18 +; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm29 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm14 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm31 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <1,u,u,u,10,2,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm9, %zmm28 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm26 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <6,14,u,u,u,7,15,u> -; AVX512F-NEXT: vpermt2q %zmm2, %zmm19, %zmm29 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm19, %zmm12 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm17 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm21 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm19, %zmm21 -; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm5 -; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm19, %zmm5 -; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm6 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 +; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm7 ; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm7 ; AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm11 ; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm11 -; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm12, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm11 +; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm5 ; AVX512F-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm0, %zmm9 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm3 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 ; AVX512F-NEXT: movb $49, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} ; AVX512F-NEXT: movb $-116, %al ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm19 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm16 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1} ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm28 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm26 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm24 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm27 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm15 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 {%k3} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm18 {%k3} -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: movb $8, %al -; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} -; AVX512F-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,9,2,3,4,5,10,7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,1,2,3,4,13,6,7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm28 -; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,14,3,4,5,6,15] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm29 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm22 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm20 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm21 {%k3} +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: movb $8, %al +; AVX512F-NEXT: kmovw %eax, %k3 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} +; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 +; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm24 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k3} +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm28 +; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm17 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm26 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k3} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm30 {%k3} +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm22 +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 {%k3} +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqa64 256(%r8), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm29 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm18 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa64 320(%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa64 320(%r8), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm23 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm11 -; AVX512F-NEXT: vmovdqa64 448(%r8), %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 384(%r8), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 +; AVX512F-NEXT: vmovdqa64 448(%r8), %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm8 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 {%k2} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm3, 2496(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm9, 2432(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, 2368(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 {%k2} +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm5, 2496(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm8, 2432(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm6, 2368(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm4, 2304(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm1, 2240(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm24, 2240(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm11, 2176(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm2, 2112(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm0, 2112(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm13, 2048(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm20, 1984(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm8, 1920(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, 1856(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm23, 1792(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm16, 1728(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm24, 1664(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm14, 1600(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm5, 1536(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 1472(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm25, 1408(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm28, 1344(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm15, 1280(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm21, 1216(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm31, 1152(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm27, 1088(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm9, 1984(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 1920(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm7, 1856(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 1792(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 1728(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm27, 1664(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm1, 1600(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm18, 1536(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm28, 1472(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm29, 1408(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm20, 1344(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm3, 1280(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm19, 1216(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 1152(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm31, 1088(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 1024(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 960(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm17, 896(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm14, 896(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 832(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm30, 768(%r9) @@ -5692,7 +5679,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovaps %zmm0, 704(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 640(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm12, 576(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm26, 576(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 512(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5701,7 +5688,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm29, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm25, 256(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload @@ -5710,447 +5697,444 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, (%r9) -; AVX512F-NEXT: addq $3080, %rsp # imm = 0xC08 +; AVX512F-NEXT: addq $3144, %rsp # imm = 0xC48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride5_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3080, %rsp # imm = 0xC08 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 +; AVX512BW-NEXT: subq $3144, %rsp # imm = 0xC48 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <3,u,u,u,12,4,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <3,u,u,u,12,4,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm20, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,8,u,u,u,1,9,u> -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm29, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,8,u,u,u,1,9,u> +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm8 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm6 +; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm25 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <1,u,u,u,10,2,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <6,14,u,u,u,7,15,u> +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm26 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm26 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm19 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 +; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm18 +; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm29 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <1,u,u,u,10,2,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm9, %zmm28 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm26 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <6,14,u,u,u,7,15,u> -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm29 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm12 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm17 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm21 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm21 -; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm19, %zmm5 -; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 +; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm7 ; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm7 ; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm11 ; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm11 -; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm11 +; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm9 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 ; AVX512BW-NEXT: movb $49, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} ; AVX512BW-NEXT: movb $-116, %al ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm19 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1} ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm26 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm27 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k3} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k3} -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: movb $8, %al -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} -; AVX512BW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,9,2,3,4,5,10,7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,1,2,3,4,13,6,7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,14,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm29 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm20 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k3} +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: movb $8, %al +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k3} +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm28 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm11 -; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 +; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm8 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm3, 2496(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 2432(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 2368(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm5, 2496(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 2432(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 2368(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 2304(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 2240(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 2240(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 2176(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 2112(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 2112(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 2048(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 1984(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 1920(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 1856(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 1792(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1728(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 1664(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 1600(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 1536(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1472(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 1408(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 1344(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 1280(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 1216(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 1152(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 1088(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 1984(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 1920(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 1856(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 1792(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 1728(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 1664(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1600(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 1536(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 1472(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 1408(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 1344(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 1280(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1152(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 1088(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1024(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 960(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 896(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 832(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm30, 768(%r9) @@ -6158,7 +6142,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 704(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 640(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 576(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 512(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -6167,7 +6151,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 256(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload @@ -6176,7 +6160,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%r9) -; AVX512BW-NEXT: addq $3080, %rsp # imm = 0xC08 +; AVX512BW-NEXT: addq $3144, %rsp # imm = 0xC48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll index 71ebeb1d24afa..f5cd64c698d5b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -310,98 +310,99 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i64_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm3 -; SSE-NEXT: movaps (%rsi), %xmm9 -; SSE-NEXT: movaps 16(%rsi), %xmm13 -; SSE-NEXT: movaps 32(%rsi), %xmm12 -; SSE-NEXT: movaps (%rdx), %xmm2 -; SSE-NEXT: movaps 16(%rdx), %xmm4 -; SSE-NEXT: movaps 32(%rdx), %xmm7 +; SSE-NEXT: movaps (%rdi), %xmm2 +; SSE-NEXT: movaps 16(%rdi), %xmm3 +; SSE-NEXT: movaps 32(%rdi), %xmm5 +; SSE-NEXT: movaps (%rsi), %xmm1 +; SSE-NEXT: movaps 16(%rsi), %xmm12 +; SSE-NEXT: movaps 32(%rsi), %xmm14 +; SSE-NEXT: movaps (%rdx), %xmm4 +; SSE-NEXT: movaps 16(%rdx), %xmm6 +; SSE-NEXT: movaps 32(%rdx), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%rcx), %xmm10 -; SSE-NEXT: movaps 16(%rcx), %xmm14 -; SSE-NEXT: movaps (%r8), %xmm5 -; SSE-NEXT: movaps 16(%r8), %xmm8 +; SSE-NEXT: movaps 16(%rcx), %xmm13 +; SSE-NEXT: movaps (%r8), %xmm7 +; SSE-NEXT: movaps 16(%r8), %xmm9 ; SSE-NEXT: movaps (%r9), %xmm11 -; SSE-NEXT: movaps 16(%r9), %xmm15 -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm9[0] -; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm10[1] +; SSE-NEXT: movaps 16(%r9), %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] +; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm11[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm14[1] +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm15[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm12[0] +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rcx), %xmm12 -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%r8), %xmm12 +; SSE-NEXT: movaps %xmm6, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm13[1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm14[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%r8), %xmm5 ; SSE-NEXT: movaps 32(%r9), %xmm0 -; SSE-NEXT: movaps %xmm12, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movaps 48(%rdi), %xmm5 -; SSE-NEXT: movaps 48(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] -; SSE-NEXT: movaps 48(%rdx), %xmm2 -; SSE-NEXT: movaps 48(%rcx), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: movaps 48(%r8), %xmm3 -; SSE-NEXT: movaps 48(%r9), %xmm4 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 48(%rdi), %xmm6 +; SSE-NEXT: movaps 48(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: movaps 48(%rdx), %xmm1 +; SSE-NEXT: movaps 48(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 48(%r8), %xmm0 +; SSE-NEXT: movaps 48(%r9), %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 368(%rax) -; SSE-NEXT: movaps %xmm2, 352(%rax) -; SSE-NEXT: movaps %xmm5, 336(%rax) -; SSE-NEXT: movaps %xmm0, 320(%rax) -; SSE-NEXT: movaps %xmm1, 304(%rax) -; SSE-NEXT: movaps %xmm6, 288(%rax) -; SSE-NEXT: movaps %xmm12, 272(%rax) +; SSE-NEXT: movaps %xmm0, 368(%rax) +; SSE-NEXT: movaps %xmm1, 352(%rax) +; SSE-NEXT: movaps %xmm6, 336(%rax) +; SSE-NEXT: movaps %xmm2, 320(%rax) +; SSE-NEXT: movaps %xmm4, 304(%rax) +; SSE-NEXT: movaps %xmm7, 288(%rax) +; SSE-NEXT: movaps %xmm5, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rax) -; SSE-NEXT: movaps %xmm7, 224(%rax) -; SSE-NEXT: movaps %xmm8, 208(%rax) +; SSE-NEXT: movaps %xmm8, 224(%rax) +; SSE-NEXT: movaps %xmm14, 208(%rax) ; SSE-NEXT: movaps %xmm15, 192(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rax) +; SSE-NEXT: movaps %xmm9, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) -; SSE-NEXT: movaps %xmm14, 128(%rax) -; SSE-NEXT: movaps %xmm13, 112(%rax) +; SSE-NEXT: movaps %xmm13, 128(%rax) +; SSE-NEXT: movaps %xmm12, 112(%rax) ; SSE-NEXT: movaps %xmm11, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) @@ -410,7 +411,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps %xmm10, 32(%rax) -; SSE-NEXT: movaps %xmm9, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: addq $24, %rsp @@ -418,9 +420,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX1-ONLY-LABEL: store_i64_stride6_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm13 ; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm11 -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm13 +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm14 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -433,38 +435,38 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm2, %ymm8 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm6[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm2, %ymm7 ; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm7[2,3],ymm2[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm8, %ymm10 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm9[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm7, %ymm10 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm14[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm12[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm13[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm14[0,1],ymm15[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3] ; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm15[1],ymm12[3],ymm15[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] ; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm15[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0],ymm13[0],ymm12[2],ymm13[3] -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm13 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[2],ymm14[3] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm14 ; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] ; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm1[2,3],ymm15[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm11[0],ymm13[0],ymm11[2],ymm13[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm11[0],ymm14[0],ymm11[2],ymm14[3] ; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm11 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm15 @@ -485,8 +487,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm14[0],xmm4[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm12[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm6[0] ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm5[0] @@ -495,13 +497,13 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm6, 208(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm4, 192(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 128(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 320(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 128(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 320(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm1, 160(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm10, 224(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm11, 352(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) @@ -513,82 +515,82 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-LABEL: store_i64_stride6_vf8: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm3 ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm5 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = xmm4[0,0] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = xmm3[0,0] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm9 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm9[1] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[0,1],ymm2[0,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm12 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm12[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm12 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm10[1],xmm12[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm11[1],xmm9[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm13[1],xmm11[1] ; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = xmm5[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm14[1],xmm7[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm3[0,1],ymm13[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm7[1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm14 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm14[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 48(%r9), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm10 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm10[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm12 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm8[1],ymm13[1],ymm8[3],ymm13[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm12[2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm13[0],ymm8[2],ymm13[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%rax) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 288(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 320(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 288(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 320(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 352(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 224(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 256(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vzeroupper @@ -791,47 +793,47 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps (%rdi), %xmm7 ; SSE-NEXT: movaps 16(%rdi), %xmm8 ; SSE-NEXT: movaps 32(%rdi), %xmm9 -; SSE-NEXT: movaps (%rsi), %xmm2 +; SSE-NEXT: movaps (%rsi), %xmm3 ; SSE-NEXT: movaps 16(%rsi), %xmm1 ; SSE-NEXT: movaps 32(%rsi), %xmm0 ; SSE-NEXT: movaps (%rdx), %xmm10 ; SSE-NEXT: movaps 16(%rdx), %xmm11 ; SSE-NEXT: movaps 32(%rdx), %xmm12 -; SSE-NEXT: movaps (%rcx), %xmm4 -; SSE-NEXT: movaps 16(%rcx), %xmm3 +; SSE-NEXT: movaps (%rcx), %xmm5 +; SSE-NEXT: movaps 16(%rcx), %xmm2 ; SSE-NEXT: movaps (%r8), %xmm13 ; SSE-NEXT: movaps 16(%r8), %xmm15 -; SSE-NEXT: movaps (%r9), %xmm5 -; SSE-NEXT: movaps 16(%r9), %xmm6 +; SSE-NEXT: movaps (%r9), %xmm6 +; SSE-NEXT: movaps 16(%r9), %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0] +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm3[0] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] +; SSE-NEXT: movaps %xmm10, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm5[1] +; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm3[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm4[1] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -899,63 +901,63 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 80(%rdx), %xmm11 +; SSE-NEXT: movaps 80(%rdx), %xmm12 ; SSE-NEXT: movaps 80(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movaps 80(%r8), %xmm13 -; SSE-NEXT: movaps 80(%r9), %xmm1 -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; SSE-NEXT: movaps 96(%rdi), %xmm12 -; SSE-NEXT: movaps 96(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm12, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movaps 96(%rdx), %xmm9 -; SSE-NEXT: movaps 96(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: movaps 80(%r8), %xmm14 +; SSE-NEXT: movaps 80(%r9), %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: movaps 96(%rdi), %xmm9 +; SSE-NEXT: movaps 96(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 96(%r8), %xmm7 -; SSE-NEXT: movaps 96(%r9), %xmm1 -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] -; SSE-NEXT: movaps 112(%rdi), %xmm1 -; SSE-NEXT: movaps 112(%rsi), %xmm4 -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; SSE-NEXT: movaps 112(%rdx), %xmm4 -; SSE-NEXT: movaps 112(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] -; SSE-NEXT: movaps 112(%r8), %xmm2 -; SSE-NEXT: movaps 112(%r9), %xmm5 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; SSE-NEXT: movaps 96(%rdx), %xmm10 +; SSE-NEXT: movaps 96(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm10, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 96(%r8), %xmm5 +; SSE-NEXT: movaps 96(%r9), %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 112(%rdi), %xmm6 +; SSE-NEXT: movaps 112(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: movaps 112(%rdx), %xmm1 +; SSE-NEXT: movaps 112(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 112(%r8), %xmm0 +; SSE-NEXT: movaps 112(%r9), %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 752(%rax) -; SSE-NEXT: movaps %xmm4, 736(%rax) -; SSE-NEXT: movaps %xmm1, 720(%rax) -; SSE-NEXT: movaps %xmm0, 704(%rax) -; SSE-NEXT: movaps %xmm3, 688(%rax) -; SSE-NEXT: movaps %xmm6, 672(%rax) -; SSE-NEXT: movaps %xmm7, 656(%rax) -; SSE-NEXT: movaps %xmm9, 640(%rax) -; SSE-NEXT: movaps %xmm12, 624(%rax) +; SSE-NEXT: movaps %xmm0, 752(%rax) +; SSE-NEXT: movaps %xmm1, 736(%rax) +; SSE-NEXT: movaps %xmm6, 720(%rax) +; SSE-NEXT: movaps %xmm2, 704(%rax) +; SSE-NEXT: movaps %xmm4, 688(%rax) +; SSE-NEXT: movaps %xmm7, 672(%rax) +; SSE-NEXT: movaps %xmm5, 656(%rax) +; SSE-NEXT: movaps %xmm10, 640(%rax) +; SSE-NEXT: movaps %xmm9, 624(%rax) ; SSE-NEXT: movaps %xmm8, 608(%rax) -; SSE-NEXT: movaps %xmm10, 592(%rax) -; SSE-NEXT: movaps %xmm14, 576(%rax) -; SSE-NEXT: movaps %xmm13, 560(%rax) -; SSE-NEXT: movaps %xmm11, 544(%rax) +; SSE-NEXT: movaps %xmm11, 592(%rax) +; SSE-NEXT: movaps %xmm13, 576(%rax) +; SSE-NEXT: movaps %xmm14, 560(%rax) +; SSE-NEXT: movaps %xmm12, 544(%rax) ; SSE-NEXT: movaps %xmm15, 528(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 512(%rax) @@ -1028,8 +1030,8 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i64_stride6_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm13 +; AVX1-ONLY-NEXT: subq $440, %rsp # imm = 0x1B8 +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm11 ; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm0 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm3 @@ -1039,10 +1041,10 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 @@ -1065,153 +1067,153 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm10[1] +; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm8[1] -; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm6[1] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm12 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm12[1],ymm2[3],ymm12[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm4[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[2],ymm11[3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm15 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm10[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm10 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm10[2,3],ymm11[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm11[1],ymm2[3],ymm11[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm2[2,3],ymm11[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm11 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm11[0],ymm1[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm11[0],ymm5[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm1 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm5[0],ymm11[0],ymm5[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm4 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm4[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm14[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm14[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 592(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 576(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 400(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 384(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 704(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 512(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 592(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 576(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 400(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 384(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 704(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 512(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 320(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm1, 736(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm13, 672(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -1219,13 +1221,13 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 608(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 288(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1238,7 +1240,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 +; AVX1-ONLY-NEXT: addq $440, %rsp # imm = 0x1B8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1257,24 +1259,24 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm15[1],xmm3[1] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[0,1],ymm3[0,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm10 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm9 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm13 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm7 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm8 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm10[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm9[1] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[0,1],ymm2[0,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1286,69 +1288,69 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[0,1],ymm0[0,1] -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm9 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = xmm9[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm1 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm4[1],xmm3[1] -; AVX2-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm5 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm2[1],xmm8[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[0,1],ymm9[0,1] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[0,1],ymm10[0,1] ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm0 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = xmm0[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm1[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm12 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm10[1] ; AVX2-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm15, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm3 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] @@ -1357,8 +1359,8 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm6 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[2,3],ymm3[2,3] @@ -1366,57 +1368,57 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],mem[1],ymm6[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm6 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm8 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] -; AVX2-ONLY-NEXT: vperm2f128 $19, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%r9), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX2-ONLY-NEXT: vperm2f128 $19, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 80(%r9), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],mem[1],ymm10[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm10 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm10[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm12 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm12[1],mem[1],ymm12[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm12 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 736(%rax) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm11, 736(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 704(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 672(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 576(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 544(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 512(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 480(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 672(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 576(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 544(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 512(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 480(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 384(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 352(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 352(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 320(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 288(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm15, 96(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) @@ -2593,8 +2595,8 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps 16(%rcx), %xmm3 ; SSE-NEXT: movaps (%r8), %xmm13 ; SSE-NEXT: movaps 16(%r8), %xmm14 -; SSE-NEXT: movaps (%r9), %xmm5 -; SSE-NEXT: movaps 16(%r9), %xmm6 +; SSE-NEXT: movaps (%r9), %xmm6 +; SSE-NEXT: movaps 16(%r9), %xmm5 ; SSE-NEXT: movaps %xmm7, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2606,9 +2608,9 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] @@ -2621,9 +2623,9 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -2859,63 +2861,63 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 208(%rdx), %xmm13 -; SSE-NEXT: movaps 208(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; SSE-NEXT: movaps 208(%r8), %xmm10 -; SSE-NEXT: movaps 208(%r9), %xmm2 -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] -; SSE-NEXT: movaps 224(%rdi), %xmm12 +; SSE-NEXT: movaps 208(%rdx), %xmm12 +; SSE-NEXT: movaps 208(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 208(%r8), %xmm11 +; SSE-NEXT: movaps 208(%r9), %xmm0 +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdi), %xmm13 ; SSE-NEXT: movaps 224(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: movaps %xmm13, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 224(%rdx), %xmm9 ; SSE-NEXT: movaps 224(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 224(%r8), %xmm6 -; SSE-NEXT: movaps 224(%r9), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] +; SSE-NEXT: movaps 224(%r8), %xmm5 +; SSE-NEXT: movaps 224(%r9), %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 240(%rdi), %xmm6 +; SSE-NEXT: movaps 240(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] -; SSE-NEXT: movaps 240(%rdi), %xmm1 -; SSE-NEXT: movaps 240(%rsi), %xmm4 -; SSE-NEXT: movaps %xmm1, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; SSE-NEXT: movaps 240(%rdx), %xmm4 -; SSE-NEXT: movaps 240(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] -; SSE-NEXT: movaps 240(%r8), %xmm2 -; SSE-NEXT: movaps 240(%r9), %xmm5 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; SSE-NEXT: movaps 240(%rdx), %xmm1 +; SSE-NEXT: movaps 240(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 240(%r8), %xmm0 +; SSE-NEXT: movaps 240(%r9), %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 1520(%rax) -; SSE-NEXT: movaps %xmm4, 1504(%rax) -; SSE-NEXT: movaps %xmm1, 1488(%rax) -; SSE-NEXT: movaps %xmm0, 1472(%rax) -; SSE-NEXT: movaps %xmm3, 1456(%rax) +; SSE-NEXT: movaps %xmm0, 1520(%rax) +; SSE-NEXT: movaps %xmm1, 1504(%rax) +; SSE-NEXT: movaps %xmm6, 1488(%rax) +; SSE-NEXT: movaps %xmm2, 1472(%rax) +; SSE-NEXT: movaps %xmm4, 1456(%rax) ; SSE-NEXT: movaps %xmm7, 1440(%rax) -; SSE-NEXT: movaps %xmm6, 1424(%rax) +; SSE-NEXT: movaps %xmm5, 1424(%rax) ; SSE-NEXT: movaps %xmm9, 1408(%rax) -; SSE-NEXT: movaps %xmm12, 1392(%rax) +; SSE-NEXT: movaps %xmm13, 1392(%rax) ; SSE-NEXT: movaps %xmm8, 1376(%rax) -; SSE-NEXT: movaps %xmm11, 1360(%rax) +; SSE-NEXT: movaps %xmm10, 1360(%rax) ; SSE-NEXT: movaps %xmm14, 1344(%rax) -; SSE-NEXT: movaps %xmm10, 1328(%rax) -; SSE-NEXT: movaps %xmm13, 1312(%rax) +; SSE-NEXT: movaps %xmm11, 1328(%rax) +; SSE-NEXT: movaps %xmm12, 1312(%rax) ; SSE-NEXT: movaps %xmm15, 1296(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1280(%rax) @@ -3084,340 +3086,342 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i64_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1592, %rsp # imm = 0x638 -; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm3 +; AVX1-ONLY-NEXT: subq $1608, %rsp # imm = 0x648 +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm13[1],xmm11[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 128(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm14[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovapd 128(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r9), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm2[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; AVX1-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r9), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm4[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] ; AVX1-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r9), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm0[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] -; AVX1-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r9), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3],ymm12[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm12[1],ymm7[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm7 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm7[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r9), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm12 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm12[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm12[1],ymm1[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm1[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm12[0],ymm4[2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm12[1],ymm4[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm6 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm6[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm12[0],ymm4[2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm12[1],ymm4[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm4[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm12[0],ymm5[2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm3[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 128(%r9), %ymm3 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm3[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm12[0],ymm5[2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm5[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm13[0],ymm12[0],ymm13[2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd 192(%r9), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm2[2,3],ymm13[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm12[0],ymm13[0],ymm12[2],ymm13[3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm13[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm12[0],ymm13[0],ymm12[2],ymm13[3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm4[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[2],ymm7[3] ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 144(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 144(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm14[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm10[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm9[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm0[0],xmm10[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = xmm0[0],mem[0] @@ -3466,8 +3470,8 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm11, 192(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm12, 400(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm13, 384(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 784(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 784(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 768(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1472(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3548,7 +3552,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $1592, %rsp # imm = 0x638 +; AVX1-ONLY-NEXT: addq $1608, %rsp # imm = 0x648 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3812,7 +3816,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 @@ -3829,106 +3833,106 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 144(%r9), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 176(%r9), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm2 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%r9), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 208(%r9), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm15[0],ymm2[2],ymm15[2] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm2 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm12[0],ymm2[2],ymm12[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] +; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm12[1],ymm2[3],ymm12[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 208(%r9), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm14 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] -; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm14 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm12 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1504(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 1472(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 1440(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 1472(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 1440(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1344(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 1312(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 1280(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 1248(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 1280(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 1248(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1152(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 1120(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1088(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 1056(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 1120(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 1088(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 1056(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 928(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 896(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 864(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 928(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 896(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 864(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 768(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 736(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 736(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 704(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 672(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 576(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 544(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 544(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 512(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3942,7 +3946,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 160(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3988,281 +3992,278 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride6_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm29 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] -; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: movb $12, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $48, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $16, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 1280(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1216(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1088(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1024(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1152(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512F-ONLY-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq @@ -4270,281 +4271,278 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-LABEL: store_i64_stride6_vf32: ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm29 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] -; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] -; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] +; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movb $12, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $48, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $16, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1216(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1152(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512F-ONLY-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq @@ -4552,281 +4550,278 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-LABEL: store_i64_stride6_vf32: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm29 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] -; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12] -; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] -; AVX512DQ-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] -; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] -; AVX512DQ-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] +; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm20 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] +; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] +; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] -; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: movb $12, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512DQ-SLOW-NEXT: movb $48, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $16, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 1280(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 1216(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 1088(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 1024(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1152(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512DQ-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq @@ -4834,281 +4829,278 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-LABEL: store_i64_stride6_vf32: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm29 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] -; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12] -; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] -; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] -; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] +; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm20 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] +; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] -; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movb $12, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512DQ-FAST-NEXT: movb $48, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $16, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512DQ-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -5116,281 +5108,278 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride6_vf32: ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm29 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $16, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 1280(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1216(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1088(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1024(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1152(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq @@ -5398,281 +5387,278 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride6_vf32: ; AVX512BW-ONLY-FAST: # %bb.0: ; AVX512BW-ONLY-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm29 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: movb $12, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $48, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $16, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1216(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1152(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512BW-ONLY-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq @@ -5680,281 +5666,278 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-LABEL: store_i64_stride6_vf32: ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm29 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] -; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] +; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm20 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: movb $12, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $48, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $16, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 1280(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 1216(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 1088(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 1024(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1152(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512DQBW-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq @@ -5962,281 +5945,278 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-LABEL: store_i64_stride6_vf32: ; AVX512DQBW-FAST: # %bb.0: ; AVX512DQBW-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm29 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] -; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12] -; AVX512DQBW-FAST-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] -; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] -; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] -; AVX512DQBW-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: movb $12, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512DQBW-FAST-NEXT: movb $48, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $16, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512DQBW-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq @@ -6870,70 +6850,70 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 464(%rdi), %xmm14 +; SSE-NEXT: movaps 464(%rdi), %xmm15 ; SSE-NEXT: movaps 464(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 464(%rdx), %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps 464(%rdx), %xmm14 ; SSE-NEXT: movaps 464(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 464(%r8), %xmm11 ; SSE-NEXT: movaps 464(%r9), %xmm0 ; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movaps 480(%rdi), %xmm9 +; SSE-NEXT: movaps 480(%rdi), %xmm12 ; SSE-NEXT: movaps 480(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: movaps %xmm12, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 480(%rdx), %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 480(%rdx), %xmm8 ; SSE-NEXT: movaps 480(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 480(%r8), %xmm6 -; SSE-NEXT: movaps 480(%r9), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 480(%r8), %xmm5 +; SSE-NEXT: movaps 480(%r9), %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 496(%rdi), %xmm6 +; SSE-NEXT: movaps 496(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] -; SSE-NEXT: movaps 496(%rdi), %xmm1 -; SSE-NEXT: movaps 496(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movaps 496(%rdx), %xmm3 -; SSE-NEXT: movaps 496(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps 496(%r8), %xmm2 -; SSE-NEXT: movaps 496(%r9), %xmm5 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; SSE-NEXT: movaps 496(%rdx), %xmm1 +; SSE-NEXT: movaps 496(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 496(%r8), %xmm0 +; SSE-NEXT: movaps 496(%r9), %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 3056(%rax) -; SSE-NEXT: movaps %xmm3, 3040(%rax) -; SSE-NEXT: movaps %xmm1, 3024(%rax) -; SSE-NEXT: movaps %xmm0, 3008(%rax) +; SSE-NEXT: movaps %xmm0, 3056(%rax) +; SSE-NEXT: movaps %xmm1, 3040(%rax) +; SSE-NEXT: movaps %xmm6, 3024(%rax) +; SSE-NEXT: movaps %xmm2, 3008(%rax) ; SSE-NEXT: movaps %xmm4, 2992(%rax) ; SSE-NEXT: movaps %xmm7, 2976(%rax) -; SSE-NEXT: movaps %xmm6, 2960(%rax) -; SSE-NEXT: movaps %xmm10, 2944(%rax) -; SSE-NEXT: movaps %xmm9, 2928(%rax) -; SSE-NEXT: movaps %xmm8, 2912(%rax) -; SSE-NEXT: movaps %xmm12, 2896(%rax) +; SSE-NEXT: movaps %xmm5, 2960(%rax) +; SSE-NEXT: movaps %xmm8, 2944(%rax) +; SSE-NEXT: movaps %xmm12, 2928(%rax) +; SSE-NEXT: movaps %xmm9, 2912(%rax) +; SSE-NEXT: movaps %xmm10, 2896(%rax) ; SSE-NEXT: movaps %xmm13, 2880(%rax) ; SSE-NEXT: movaps %xmm11, 2864(%rax) -; SSE-NEXT: movaps %xmm15, 2848(%rax) -; SSE-NEXT: movaps %xmm14, 2832(%rax) +; SSE-NEXT: movaps %xmm14, 2848(%rax) +; SSE-NEXT: movaps %xmm15, 2832(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 2816(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7294,644 +7274,644 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-LABEL: store_i64_stride6_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $3464, %rsp # imm = 0xD88 -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm6 -; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm7 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 128(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 256(%r8), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 264(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 256(%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 288(%r8), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 264(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 296(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 288(%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 320(%r8), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 296(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 328(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 320(%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 352(%r8), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 328(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 360(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 352(%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 384(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 360(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 392(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 384(%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 392(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 424(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 424(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 456(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 456(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0],ymm2[1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vbroadcastsd 488(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 488(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%r9), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm6 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm10 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm3 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 128(%r9), %ymm14 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm14[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovapd 128(%r9), %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm7 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[2,3],ymm8[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm8 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 192(%r9), %ymm15 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm15[2,3],ymm8[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%r9), %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm8[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm10 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovapd 256(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovapd 256(%r9), %ymm8 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 256(%r9), %ymm11 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 288(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 288(%r9), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 320(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 320(%r9), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 288(%r9), %ymm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 320(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 320(%r9), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 352(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 352(%r9), %ymm11 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 352(%r9), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 384(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 384(%r9), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 384(%r9), %ymm12 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 416(%r9), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm12[2,3],ymm9[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%r9), %ymm13 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 448(%r9), %ymm13 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 480(%r9), %ymm9 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm9[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 144(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 272(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 272(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 280(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 304(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 448(%r9), %ymm14 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 480(%r9), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 312(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 336(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 336(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 344(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 368(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 368(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 376(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd 144(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 400(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 400(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 408(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 272(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 272(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 280(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 304(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 312(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 336(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 336(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 344(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 368(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 368(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 376(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 400(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 400(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 408(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] @@ -7942,7 +7922,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 440(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -7954,7 +7934,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 472(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -7966,7 +7946,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 504(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -8572,10 +8552,10 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %xmm14 -; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %xmm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm0[1] ; AVX2-ONLY-NEXT: vbroadcastsd 392(%r8), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -8630,8 +8610,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 480(%rcx), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %xmm3 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vbroadcastsd 488(%r8), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 488(%r8), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8708,9 +8688,9 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm0 @@ -8775,8 +8755,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 @@ -8958,35 +8937,36 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 432(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 432(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 432(%r9), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 440(%r8), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 464(%rcx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 464(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 472(%r8), %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm4 @@ -9007,15 +8987,13 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -9031,7 +9009,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3040(%rax) @@ -9041,11 +9019,12 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2880(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm6, 2848(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 2816(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 2784(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 2784(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2688(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 2656(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 2624(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 2656(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2624(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2592(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9106,8 +9085,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1056(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 928(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 928(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 896(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9214,1347 +9192,5421 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-LABEL: store_i64_stride6_vf64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3272, %rsp # imm = 0xCC8 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm6 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm5 -; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm4 -; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm3 -; AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm1 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm28 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm26 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm25 -; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm24 -; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm23 -; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm19 -; AVX512F-NEXT: vmovdqa64 448(%rcx), %zmm10 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12] -; AVX512F-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm28, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,9,2,10,1,9,2,10] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm9, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,13,6,14,5,13,6,14] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm11, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm9, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm11, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm12, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm9, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm11, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm12, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm9, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm11, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm12, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm11, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm12, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm9, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm11, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm12, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm12, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm10, %zmm1, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm10, %zmm1, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm10, %zmm1, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm6 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,12,5,13,4,12,5,13] -; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm19, %zmm27 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 384(%rsi), %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm29 -; AVX512F-NEXT: vmovdqa64 320(%rsi), %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm31 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm23 -; AVX512F-NEXT: vpermi2q %zmm23, %zmm24, %zmm19 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm25, %zmm13 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm10 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,1,9,0,8,1,9] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,15,7,15] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm25, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm25, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm25, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm25, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm25, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm4, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm25, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm4, %zmm20 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm28 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm14, %zmm25 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm14, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm14, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm14 -; AVX512F-NEXT: movb $12, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} -; AVX512F-NEXT: movb $48, %al -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm20 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm23 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm21 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm8 {%k2} -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,9,u,4,5,6,7> -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm5 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm6 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%r8), %zmm9 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%r8), %zmm10 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm26 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%r8), %zmm12 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm3, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%r8), %zmm13 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm17 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm21 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm21 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm8 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm16 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm16 = zmm15[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: movb $16, %al -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <14,u,2,3,4,5,15,u> -; AVX512F-NEXT: vpermt2q %zmm5, %zmm15, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm5 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm24 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm23 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm23 = zmm31[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm23 -; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm22 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm18 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm18 = zmm30[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm15, %zmm18 -; AVX512F-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm16 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm8 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm8 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm15, %zmm8 -; AVX512F-NEXT: vmovdqa 384(%rdi), %ymm7 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm7 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm9 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm9 = zmm28[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm12, %zmm3, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm15, %zmm9 -; AVX512F-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm10 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm5 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm5 = zmm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm27 {%k2} -; AVX512F-NEXT: vpermt2q %zmm13, %zmm15, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,9,4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%r9), %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm21, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%r9), %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm26, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%r9), %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%r9), %zmm29 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm29, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,13,4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm12, %zmm20 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm12, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm21, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm26, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm29, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa 128(%rdx), %xmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdx), %xmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa 256(%rdx), %xmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa 320(%rdx), %xmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 384(%rdx), %xmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa 448(%rdx), %xmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm4 {%k1} -; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm17, %zmm25 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,8,6,7] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm30, %zmm25 -; AVX512F-NEXT: vinserti32x4 $2, 64(%r8), %zmm20, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm30, %zmm31 -; AVX512F-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm30, %zmm20 -; AVX512F-NEXT: vinserti32x4 $2, 192(%r8), %zmm11, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm30, %zmm17 -; AVX512F-NEXT: vinserti32x4 $2, 256(%r8), %zmm2, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm30, %zmm12 -; AVX512F-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm30, %zmm11 -; AVX512F-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm30, %zmm6 -; AVX512F-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm30, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,10,2,3,4,5,6,11] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,12,6,7] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,14,2,3,4,5,6,15] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm30, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm30, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm30, %zmm24 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm1, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm30, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm30, %zmm16 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm30, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm30, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm5 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm5, 3008(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm27, 2880(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 2816(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2752(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm9, 2624(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2560(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm26, 2496(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 2432(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 2240(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, 2112(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 2048(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 1856(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm15, 1728(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm22, 1664(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1600(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm14, 1344(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm24, 1280(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 1088(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 960(%rax) -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 576(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm19, 192(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 2688(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 2304(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 1920(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 1536(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 1152(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm31, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, (%rax) -; AVX512F-NEXT: addq $3272, %rsp # imm = 0xCC8 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-ONLY-SLOW-LABEL: store_i64_stride6_vf64: +; AVX512F-ONLY-SLOW: # %bb.0: +; AVX512F-ONLY-SLOW-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: movb $12, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512F-ONLY-SLOW-NEXT: movb $48, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: movb $16, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1856(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1472(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512F-ONLY-SLOW-NEXT: vzeroupper +; AVX512F-ONLY-SLOW-NEXT: retq +; +; AVX512F-ONLY-FAST-LABEL: store_i64_stride6_vf64: +; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] +; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] +; AVX512F-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: movb $12, %al +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512F-ONLY-FAST-NEXT: movb $48, %al +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: movb $16, %al +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 1856(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512F-ONLY-FAST-NEXT: vzeroupper +; AVX512F-ONLY-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: store_i64_stride6_vf64: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] +; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] +; AVX512DQ-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm23 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: movb $12, %al +; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512DQ-SLOW-NEXT: movb $48, %al +; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: movb $16, %al +; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 1856(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 1472(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-SLOW-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: store_i64_stride6_vf64: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] +; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] +; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] +; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] +; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm23 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: movb $12, %al +; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512DQ-FAST-NEXT: movb $48, %al +; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: movb $16, %al +; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 1856(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride6_vf64: +; AVX512BW-ONLY-SLOW: # %bb.0: +; AVX512BW-ONLY-SLOW-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: movb $12, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: movb $48, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: movb $16, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1856(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1472(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512BW-ONLY-SLOW-NEXT: vzeroupper +; AVX512BW-ONLY-SLOW-NEXT: retq +; +; AVX512BW-ONLY-FAST-LABEL: store_i64_stride6_vf64: +; AVX512BW-ONLY-FAST: # %bb.0: +; AVX512BW-ONLY-FAST-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: movb $12, %al +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512BW-ONLY-FAST-NEXT: movb $48, %al +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: movb $16, %al +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 1856(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1472(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512BW-ONLY-FAST-NEXT: vzeroupper +; AVX512BW-ONLY-FAST-NEXT: retq +; +; AVX512DQBW-SLOW-LABEL: store_i64_stride6_vf64: +; AVX512DQBW-SLOW: # %bb.0: +; AVX512DQBW-SLOW-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] +; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] +; AVX512DQBW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm23 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: movb $12, %al +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512DQBW-SLOW-NEXT: movb $48, %al +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: movb $16, %al +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 1856(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 1472(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-SLOW-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512DQBW-SLOW-NEXT: vzeroupper +; AVX512DQBW-SLOW-NEXT: retq ; -; AVX512BW-LABEL: store_i64_stride6_vf64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3272, %rsp # imm = 0xCC8 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm28 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm26 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm25 -; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm24 -; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm23 -; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm19 -; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm10 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12] -; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm28, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,9,2,10,1,9,2,10] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,13,6,14,5,13,6,14] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm11, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm11, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm12, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm9, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm11, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm9, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm11, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm12, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,12,5,13,4,12,5,13] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm19, %zmm27 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm23 -; AVX512BW-NEXT: vpermi2q %zmm23, %zmm24, %zmm19 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm25, %zmm13 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,1,9,0,8,1,9] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,15,7,15] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm25, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm25, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm25, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm25, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm20 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm28 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm14, %zmm25 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm14, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm14, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm14 -; AVX512BW-NEXT: movb $12, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} -; AVX512BW-NEXT: movb $48, %al -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm23 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,9,u,4,5,6,7> -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm3, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm17 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm21 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm21 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm8 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm16 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm16 = zmm15[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: movb $16, %al -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <14,u,2,3,4,5,15,u> -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm5 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm24 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm23 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm23 = zmm31[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm23 -; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm22 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm18 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm18 = zmm30[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm18 -; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm16 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm8 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm8 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm3, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm8 -; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm7 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm7 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm9 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm9 = zmm28[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm9 -; AVX512BW-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm10 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm5 = zmm14[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm27 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,9,4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%r9), %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm26 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm28 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm29 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,13,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm20 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa 256(%rdx), %xmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa 320(%rdx), %xmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 384(%rdx), %xmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa 448(%rdx), %xmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm17, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,8,6,7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm25 -; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm20, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm30, %zmm31 -; AVX512BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm30, %zmm20 -; AVX512BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm11, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm30, %zmm17 -; AVX512BW-NEXT: vinserti32x4 $2, 256(%r8), %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm30, %zmm12 -; AVX512BW-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm30, %zmm11 -; AVX512BW-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm30, %zmm6 -; AVX512BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm30, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,10,2,3,4,5,6,11] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,12,6,7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,14,2,3,4,5,6,15] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm30, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm30, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm30, %zmm24 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm30, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm30, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm30, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm30, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm5 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm5, 3008(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 2880(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 2816(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2752(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 2624(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2560(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 2496(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 2432(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 2240(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 2112(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 2048(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1856(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 1728(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 1664(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1600(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 1344(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 1280(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 1088(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 960(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 576(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 2688(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 2304(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 1920(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 1536(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, (%rax) -; AVX512BW-NEXT: addq $3272, %rsp # imm = 0xCC8 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512DQBW-FAST-LABEL: store_i64_stride6_vf64: +; AVX512DQBW-FAST: # %bb.0: +; AVX512DQBW-FAST-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] +; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] +; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] +; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] +; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm23 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: movb $12, %al +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512DQBW-FAST-NEXT: movb $48, %al +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: movb $16, %al +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 1856(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512DQBW-FAST-NEXT: vzeroupper +; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 %in.vec1 = load <64 x i64>, ptr %in.vecptr1, align 64 %in.vec2 = load <64 x i64>, ptr %in.vecptr2, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll index 59857b1a3671e..7f999e06b304c 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll @@ -420,14 +420,14 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movapd (%rcx), %xmm7 ; SSE-NEXT: movapd 16(%rcx), %xmm11 ; SSE-NEXT: movapd (%r8), %xmm9 -; SSE-NEXT: movapd 16(%r8), %xmm15 +; SSE-NEXT: movapd 16(%r8), %xmm14 ; SSE-NEXT: movapd (%r9), %xmm12 ; SSE-NEXT: movapd 16(%r9), %xmm13 ; SSE-NEXT: movapd (%rax), %xmm0 ; SSE-NEXT: movapd 16(%rax), %xmm1 -; SSE-NEXT: movapd %xmm2, %xmm14 -; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm3[0] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, %xmm15 +; SSE-NEXT: unpcklpd {{.*#+}} xmm15 = xmm15[0],xmm3[0] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] @@ -440,18 +440,18 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movapd %xmm9, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm5, %xmm14 -; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm6[0] +; SSE-NEXT: movapd %xmm5, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm11[0] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm15[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm14[1] ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{.*#+}} xmm15 = xmm15[0],xmm13[0] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm13[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] ; SSE-NEXT: movapd 32(%rsi), %xmm12 ; SSE-NEXT: movapd %xmm10, %xmm15 @@ -461,38 +461,38 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 32(%rdx), %xmm11 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] -; SSE-NEXT: movapd 32(%rcx), %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm7[0] +; SSE-NEXT: movapd 32(%rcx), %xmm8 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; SSE-NEXT: movapd 32(%r8), %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; SSE-NEXT: movapd 32(%r9), %xmm5 -; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; SSE-NEXT: movapd 48(%rdi), %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] +; SSE-NEXT: movapd 32(%r9), %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] +; SSE-NEXT: movapd 48(%rdi), %xmm5 ; SSE-NEXT: movapd 48(%rsi), %xmm4 -; SSE-NEXT: movapd %xmm6, %xmm8 -; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: movapd %xmm5, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm4[0] ; SSE-NEXT: movapd 48(%rax), %xmm10 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm10[0],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm10[0],xmm5[1] ; SSE-NEXT: movapd 48(%rdx), %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movapd 48(%rcx), %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movapd 48(%r8), %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movapd 48(%rcx), %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movapd 48(%r8), %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movapd 48(%r9), %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, 432(%rax) -; SSE-NEXT: movapd %xmm1, 416(%rax) +; SSE-NEXT: movapd %xmm2, 416(%rax) ; SSE-NEXT: movapd %xmm4, 400(%rax) -; SSE-NEXT: movapd %xmm6, 384(%rax) -; SSE-NEXT: movapd %xmm2, 368(%rax) +; SSE-NEXT: movapd %xmm5, 384(%rax) +; SSE-NEXT: movapd %xmm1, 368(%rax) ; SSE-NEXT: movapd %xmm3, 352(%rax) -; SSE-NEXT: movapd %xmm8, 336(%rax) -; SSE-NEXT: movapd %xmm5, 320(%rax) -; SSE-NEXT: movapd %xmm7, 304(%rax) +; SSE-NEXT: movapd %xmm7, 336(%rax) +; SSE-NEXT: movapd %xmm6, 320(%rax) +; SSE-NEXT: movapd %xmm8, 304(%rax) ; SSE-NEXT: movapd %xmm12, 288(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) @@ -506,11 +506,11 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps %xmm0, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rax) +; SSE-NEXT: movapd %xmm14, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movapd %xmm14, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -538,7 +538,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm3 ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm0 -; AVX1-ONLY-NEXT: vmovapd 32(%rax), %xmm8 +; AVX1-ONLY-NEXT: vmovapd 32(%rax), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] @@ -547,87 +547,87 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovapd 32(%r8), %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm6[1] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm6[1] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm9 -; AVX1-ONLY-NEXT: vmovapd 32(%r9), %xmm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm14[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm15[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm11[0],xmm10[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm10, %ymm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm11[1],ymm6[2],ymm11[2] -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm10[0],xmm8[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm8, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2],ymm10[2] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm3[1],ymm15[1],ymm3[3],ymm15[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm15[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm14 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm3[1],ymm14[1],ymm3[3],ymm14[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm14[0],ymm8[2],ymm14[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm0[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm11[1] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm0[0,1],ymm11[2,3] ; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 32(%rax), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3],ymm14[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm14[0],ymm0[1],ymm14[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],mem[0],ymm14[2],mem[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 32(%rax), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm2[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovapd 48(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm4 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[0],mem[0],ymm14[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm9[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm13[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm13[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r9), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r9), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm10[0] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm9, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm14[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 352(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 384(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 352(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 384(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 320(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm6, 224(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 416(%rax) @@ -647,14 +647,14 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm9 ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 48(%rax), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 48(%rax), %xmm12 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] @@ -670,7 +670,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],mem[0],ymm10[2],mem[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],mem[0],ymm9[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] @@ -681,31 +681,31 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm13, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm12, %ymm12 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm11, %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] ; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm12 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm13[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm13 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],mem[0],ymm5[2],mem[2] ; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] @@ -719,7 +719,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm2, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm6 @@ -727,11 +727,11 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm8, 320(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 384(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 384(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) @@ -752,11 +752,11 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] ; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] @@ -765,14 +765,14 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] ; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] ; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 @@ -783,7 +783,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm7, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] ; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 @@ -799,10 +799,10 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] ; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm9, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] ; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 @@ -815,14 +815,14 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] ; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm12 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] ; AVX512F-ONLY-SLOW-NEXT: movb $28, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[2,3,2,3],zmm4[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[2,3,2,3],zmm2[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2] ; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 @@ -833,24 +833,24 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm13 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3] ; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm10, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm11, %zmm14 ; AVX512F-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,0,12,4,3,0,12,4] ; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm4, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $48, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] @@ -860,7 +860,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) @@ -877,10 +877,10 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] ; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] @@ -896,7 +896,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] ; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 @@ -907,7 +907,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm7, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] ; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 @@ -923,10 +923,10 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] ; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] ; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 @@ -936,7 +936,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4] ; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512F-ONLY-FAST-NEXT: movb $48, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k2} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] @@ -953,12 +953,12 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [2,10,0,3,2,10,0,3] ; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] ; AVX512F-ONLY-FAST-NEXT: movb $28, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm9[2,3,2,3],zmm4[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm9[2,3,2,3],zmm3[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] ; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 @@ -969,18 +969,18 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm9 {%k2} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] ; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm10, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 ; AVX512F-ONLY-FAST-NEXT: movb $56, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k2} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm1 ; AVX512F-ONLY-FAST-NEXT: movb $120, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -1000,78 +1000,78 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm4 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [5,0,14,6,5,0,14,6] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm10, %zmm2 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] ; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm6 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $-61, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3] -; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] ; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 ; AVX512DQ-SLOW-NEXT: movb $96, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm10 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] ; AVX512DQ-SLOW-NEXT: movb $28, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm11[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm11[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7] ; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] ; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm12 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] ; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm13 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,12,0,5,4,12,0,5] ; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 ; AVX512DQ-SLOW-NEXT: movb $24, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] -; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm10, %zmm9 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,0,10,2,1,0,10,2] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 ; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm11 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = mem[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $6, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] ; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm12, %zmm11 ; AVX512DQ-SLOW-NEXT: movb $56, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] ; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 @@ -1088,32 +1088,32 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] ; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm14, %zmm4 ; AVX512DQ-SLOW-NEXT: movb $120, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: movb $48, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: movb $14, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 384(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -1124,89 +1124,89 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm3 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6] ; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm8, %zmm5 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm9, %zmm5 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] ; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm6 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $-61, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,0,12,4,3,0,12,4] ; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512DQ-FAST-NEXT: movb $48, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm10 ; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,3,7,u> -; AVX512DQ-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,3,7,u> +; AVX512DQ-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: movb $14, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k1} ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,3,11,3,11,3,11,3] ; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [2,10,0,3,2,10,0,3] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm7 ; AVX512DQ-FAST-NEXT: movb $96, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX512DQ-FAST-NEXT: movb $28, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm10[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [15,7,15,7,15,7,15,7] ; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] ; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm11 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm11 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] ; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm12 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] ; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm7 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] -; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512DQ-FAST-NEXT: movb $24, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm10 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $6, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm8 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm9 {%k2} ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] ; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 ; AVX512DQ-FAST-NEXT: movb $56, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] ; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 @@ -1221,20 +1221,20 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm12, %zmm10 {%k2} ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] ; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm13, %zmm1 ; AVX512DQ-FAST-NEXT: movb $120, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512DQ-FAST-NEXT: vzeroupper @@ -1246,11 +1246,11 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] ; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] @@ -1259,14 +1259,14 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] ; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] ; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 @@ -1277,7 +1277,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm7, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] ; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 @@ -1293,10 +1293,10 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] ; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm9, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] ; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 @@ -1309,14 +1309,14 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] ; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm12 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] ; AVX512BW-ONLY-SLOW-NEXT: movb $28, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[2,3,2,3],zmm4[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[2,3,2,3],zmm2[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2] ; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 @@ -1327,24 +1327,24 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm13 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3] ; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm10, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm11, %zmm14 ; AVX512BW-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,0,12,4,3,0,12,4] ; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm4, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] @@ -1354,7 +1354,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) @@ -1371,10 +1371,10 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] @@ -1390,7 +1390,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] ; AVX512BW-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 @@ -1401,7 +1401,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm7, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] ; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 @@ -1417,10 +1417,10 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] ; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 @@ -1430,7 +1430,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4] ; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: movb $48, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k2} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] @@ -1447,12 +1447,12 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [2,10,0,3,2,10,0,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] ; AVX512BW-ONLY-FAST-NEXT: movb $28, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm9[2,3,2,3],zmm4[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm9[2,3,2,3],zmm3[2,3,2,3] ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] ; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 @@ -1463,18 +1463,18 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm9 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm10, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 ; AVX512BW-ONLY-FAST-NEXT: movb $56, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: movb $120, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -1494,78 +1494,78 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm4 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm10, %zmm2 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] ; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm6 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $-61, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] ; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 ; AVX512DQBW-SLOW-NEXT: movb $96, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm10 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] ; AVX512DQBW-SLOW-NEXT: movb $28, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm11[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm11[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7] ; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] ; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm12 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] ; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm13 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,12,0,5,4,12,0,5] ; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 ; AVX512DQBW-SLOW-NEXT: movb $24, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm10, %zmm9 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm11 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = mem[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $6, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] ; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm12, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm12, %zmm11 ; AVX512DQBW-SLOW-NEXT: movb $56, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] ; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 @@ -1582,32 +1582,32 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] ; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm14, %zmm4 ; AVX512DQBW-SLOW-NEXT: movb $120, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $48, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: movb $14, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 384(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; @@ -1618,89 +1618,89 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm3 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6] ; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm8, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm9, %zmm5 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] ; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm6 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] ; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $-61, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,0,12,4,3,0,12,4] ; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512DQBW-FAST-NEXT: movb $48, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm10 ; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,3,7,u> -; AVX512DQBW-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,3,7,u> +; AVX512DQBW-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: movb $14, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k1} ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,3,11,3,11,3,11,3] ; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm7 ; AVX512DQBW-FAST-NEXT: movb $96, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX512DQBW-FAST-NEXT: movb $28, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm10[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [15,7,15,7,15,7,15,7] ; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] ; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm11 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] ; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] ; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] -; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512DQBW-FAST-NEXT: movb $24, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm10 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $6, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm8 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm9 {%k2} ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] ; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm11, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 ; AVX512DQBW-FAST-NEXT: movb $56, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] ; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 @@ -1715,20 +1715,20 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm12, %zmm10 {%k2} ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] ; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm13, %zmm1 ; AVX512DQBW-FAST-NEXT: movb $120, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512DQBW-FAST-NEXT: vzeroupper @@ -1897,38 +1897,38 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm3[0],xmm15[1] ; SSE-NEXT: movapd 96(%rdx), %xmm11 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] -; SSE-NEXT: movapd 96(%rcx), %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm7[0] +; SSE-NEXT: movapd 96(%rcx), %xmm8 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; SSE-NEXT: movapd 96(%r8), %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; SSE-NEXT: movapd 96(%r9), %xmm5 -; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; SSE-NEXT: movapd 112(%rdi), %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] +; SSE-NEXT: movapd 96(%r9), %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] +; SSE-NEXT: movapd 112(%rdi), %xmm5 ; SSE-NEXT: movapd 112(%rsi), %xmm4 -; SSE-NEXT: movapd %xmm6, %xmm8 -; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: movapd %xmm5, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm4[0] ; SSE-NEXT: movapd 112(%rax), %xmm10 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm10[0],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm10[0],xmm5[1] ; SSE-NEXT: movapd 112(%rdx), %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movapd 112(%rcx), %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movapd 112(%r8), %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movapd 112(%rcx), %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movapd 112(%r8), %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movapd 112(%r9), %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, 880(%rax) -; SSE-NEXT: movapd %xmm1, 864(%rax) +; SSE-NEXT: movapd %xmm2, 864(%rax) ; SSE-NEXT: movapd %xmm4, 848(%rax) -; SSE-NEXT: movapd %xmm6, 832(%rax) -; SSE-NEXT: movapd %xmm2, 816(%rax) +; SSE-NEXT: movapd %xmm5, 832(%rax) +; SSE-NEXT: movapd %xmm1, 816(%rax) ; SSE-NEXT: movapd %xmm3, 800(%rax) -; SSE-NEXT: movapd %xmm8, 784(%rax) -; SSE-NEXT: movapd %xmm5, 768(%rax) -; SSE-NEXT: movapd %xmm7, 752(%rax) +; SSE-NEXT: movapd %xmm7, 784(%rax) +; SSE-NEXT: movapd %xmm6, 768(%rax) +; SSE-NEXT: movapd %xmm8, 752(%rax) ; SSE-NEXT: movapd %xmm12, 736(%rax) ; SSE-NEXT: movapd %xmm15, 720(%rax) ; SSE-NEXT: movapd %xmm9, 704(%rax) @@ -2115,8 +2115,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2130,10 +2130,10 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 80(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 @@ -2153,14 +2153,14 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] @@ -2229,10 +2229,10 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm7, 352(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm6, 864(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm9, 800(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 736(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rax) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload @@ -2269,74 +2269,74 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-ONLY-LABEL: store_i64_stride7_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $520, %rsp # imm = 0x208 +; AVX2-ONLY-NEXT: subq $552, %rsp # imm = 0x228 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm14 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm14[1],ymm7[3],ymm14[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm5 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm5[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm6 ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 48(%rax), %xmm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm15[1],ymm7[3],ymm15[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm15[1],ymm8[3],ymm15[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 80(%rax), %xmm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %xmm5 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm5[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm8 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm11 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2346,1858 +2346,1847 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm5 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm14, %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm6, %ymm6 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm13 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm14[0],ymm7[2],ymm14[2] +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm9 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm9[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm8 +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm6 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm7, %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],mem[0],ymm13[2],mem[2] -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm12[0],ymm6[2],ymm12[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm10, %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] -; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm1 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] +; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm3 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm14[1],xmm3[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%r9), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rax), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm15[2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm8, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rax), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1],ymm0[2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm13[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd %xmm9, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm6[1],ymm12[1],ymm6[3],ymm12[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm3, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm3, 800(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 768(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 576(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 544(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 512(%rax) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 448(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 352(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 320(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 832(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 736(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 704(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 800(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 768(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 576(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 544(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 512(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 448(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 352(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 320(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 832(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 736(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 704(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 672(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 640(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 608(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 416(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 192(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 640(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 608(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 480(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 416(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 864(%rax) -; AVX2-ONLY-NEXT: addq $520, %rsp # imm = 0x208 +; AVX2-ONLY-NEXT: addq $552, %rsp # imm = 0x228 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm16, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm13[0,1,2,3],zmm12[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $64, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm24, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm20 ; AVX512F-ONLY-SLOW-NEXT: movb $96, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] ; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm19 ; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm9, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] ; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm29, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm29, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [2,10,0,3,2,10,0,3] ; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm21, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm16, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm28, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm28, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm28, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm24[0],ymm15[0],ymm24[2],ymm15[2] ; AVX512F-ONLY-SLOW-NEXT: movb $28, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k3} = zmm20[2,3,2,3],zmm23[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm31, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm29[0],ymm31[0],ymm29[2],ymm31[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm30[2,3,2,3],zmm22[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm30, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm23, %zmm13, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm23, %zmm28, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm22, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm28, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm22, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm23[2,3,2,3],zmm30[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm26[0],ymm15[0],ymm26[2],ymm15[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm23[2,3,2,3],zmm22[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm25, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm1, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm25, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: movb $48, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm22, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm21, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm24, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm12 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm20, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $120, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k4} ; AVX512F-ONLY-SLOW-NEXT: movb $56, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} ; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm10 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $-61, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm28 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm31 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm24, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = ymm24[1],mem[1],ymm24[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm29[1],ymm31[1],ymm29[3],ymm31[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm26[1],ymm15[1],ymm26[3],ymm15[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i64_stride7_vf16: ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] ; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm18 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14] ; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm18[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm16 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512F-ONLY-FAST-NEXT: movb $12, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm18, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm22 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm22 ; AVX512F-ONLY-FAST-NEXT: movb $112, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm19[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm19, %zmm0, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm30, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm18, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm22, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm16, %zmm0, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm13, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm24, %zmm26 ; AVX512F-ONLY-FAST-NEXT: movb $96, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm19, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm24, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm13 ; AVX512F-ONLY-FAST-NEXT: movb $120, %sil -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm22 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm13, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm22 ; AVX512F-ONLY-FAST-NEXT: movb $24, %dil ; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} ; AVX512F-ONLY-FAST-NEXT: movb $-31, %dil ; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] ; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm15, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] ; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm28, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm1, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm1, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm21 ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $-61, %dil ; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 {%k3} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k3} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm21 ; AVX512F-ONLY-FAST-NEXT: movb $48, %sil ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm19 {%k3} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm24, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 {%k3} ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm24, %ymm5, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: movb $14, %sil -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm14, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm17, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm22, %ymm8, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm23 {%k3} -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm28[0],ymm24[0],ymm28[2],ymm24[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm17[0,1,2,3],zmm16[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm24, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm24, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm24, %ymm8, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm26 {%k3} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm19[0,1,2,3],zmm18[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $28, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm5[2,3,2,3],zmm10[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm8[0],ymm22[0],ymm8[2],ymm22[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm5[2,3,2,3],zmm13[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm24[0],ymm8[2],ymm24[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $6, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $64, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $56, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm26, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm16, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm18, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm5, %zmm4 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 576(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 832(%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm10 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm15 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm16, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $64, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm28 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [0,8,0,8,0,8,0,8] ; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm24 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,5,13,5,13,5,13,5] -; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm21 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5] +; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: movb $96, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] -; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm17 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] +; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm17 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm18, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm18, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm18, %zmm19 ; AVX512DQ-SLOW-NEXT: movb $24, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm9, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm14 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] -; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm29, %zmm10 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [11,3,11,3,11,3,11,3] -; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,1,12,7,0,1,12,7] +; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm30, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm0 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] ; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] -; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [5,0,14,6,5,0,14,6] -; AVX512DQ-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm15, %zmm21, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm15, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %ymm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm28, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <13,u,2,3,4,5,6,14> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [9,1,9,1,9,1,9,1] +; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm29, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] ; AVX512DQ-SLOW-NEXT: movb $28, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k3} = zmm20[2,3,2,3],zmm23[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [6,13,14,7,6,13,14,7] -; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm20 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,13,6,7,0,13,6,7] -; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm22, %zmm31, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %ymm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %ymm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm12, %zmm0, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm29[0],ymm31[0],ymm29[2],ymm31[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k3} = zmm30[2,3,2,3],zmm22[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] -; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm30, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm23, %zmm13, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,13,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm23, %zmm28, %zmm13 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [1,0,10,2,1,0,10,2] -; AVX512DQ-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm12 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,0,12,4,3,0,12,4] -; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm28, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm16, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm23, %zmm8 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,8,0,1,0,8,0,1] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm24[2,3,2,3],zmm28[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] +; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm29 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm26[0],ymm5[0],ymm26[2],ymm5[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k3} = zmm24[2,3,2,3],zmm22[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3] +; AVX512DQ-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm24, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm24, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm25, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,13,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm1, %zmm24 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] ; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm28, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm25 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,8,0,1,0,8,0,1] +; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm30, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm30 ; AVX512DQ-SLOW-NEXT: movb $48, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm6 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm0, %zmm23 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm4[0],zmm0[0],zmm4[2],zmm0[2],zmm4[4],zmm0[4],zmm4[6],zmm0[6] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm13, %zmm28 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm13[0],zmm4[2],zmm13[2],zmm4[4],zmm13[4],zmm4[6],zmm13[6] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: movb $12, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5} ; AVX512DQ-SLOW-NEXT: movb $112, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm24, %zmm1 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k7 +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm21, %zmm8 {%k7} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQ-SLOW-NEXT: movb $120, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 {%k5} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $6, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k6} +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k4} ; AVX512DQ-SLOW-NEXT: movb $56, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm8 {%k7} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm28 {%k3} -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm28 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm9 {%k6} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm30 {%k7} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} ; AVX512DQ-SLOW-NEXT: movb $-31, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k2} +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm25[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $-61, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm11 {%k5} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm23 {%k6} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23 {%k7} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm15 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm12 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 {%k6} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm23, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: movb $14, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm29[1],ymm31[1],ymm29[3],ymm31[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm26[1],ymm5[1],ymm26[3],ymm5[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k1} ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i64_stride7_vf16: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm19, %zmm16 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm17 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm10 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %xmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm24 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm17[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512DQ-FAST-NEXT: movb $12, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm0, %zmm6 {%k2} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm25, %zmm17 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k2} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm12 ; AVX512DQ-FAST-NEXT: movb $112, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm17, %zmm6 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm21, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm22 +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm12, %zmm2 {%k3} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [13,5,13,5,13,5,13,5] +; AVX512DQ-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm25 ; AVX512DQ-FAST-NEXT: movb $96, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm30, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm17, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm19 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] ; AVX512DQ-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm27, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm19 ; AVX512DQ-FAST-NEXT: movb $120, %sil -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm24[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm24 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm0, %zmm8 {%k2} -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm25, %zmm8 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm25, %zmm24 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm22[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm4 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm24, %zmm4 {%k3} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm22 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm24, %zmm13, %zmm28 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm25, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm24, %zmm29 ; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [15,7,15,7] ; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm22 ; AVX512DQ-FAST-NEXT: movb $24, %dil ; AVX512DQ-FAST-NEXT: kmovw %edi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm24 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} ; AVX512DQ-FAST-NEXT: movb $-31, %dil ; AVX512DQ-FAST-NEXT: kmovw %edi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] ; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm15, %zmm28 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] ; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm13, %zmm28, %zmm29 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm1, %zmm19 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm20 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm16, %zmm1, %zmm20 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm21 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $-61, %dil ; AVX512DQ-FAST-NEXT: kmovw %edi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm19 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm22, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k3} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] +; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm21 ; AVX512DQ-FAST-NEXT: movb $48, %sil ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm28 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm21 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2q %zmm15, %zmm12, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm27, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k3} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm28 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm12 {%k3} ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %ymm21 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm3[0],zmm5[0],zmm3[2],zmm5[2],zmm3[4],zmm5[4],zmm3[6],zmm5[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %ymm23 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm7 -; AVX512DQ-FAST-NEXT: vpermt2q %ymm27, %ymm5, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 +; AVX512DQ-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: movb $14, %sil -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm14, %zmm22 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm20 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm7 -; AVX512DQ-FAST-NEXT: vpermi2q %ymm21, %ymm7, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm22 {%k3} -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm18[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm8 +; AVX512DQ-FAST-NEXT: vpermi2q %ymm23, %ymm8, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm25 {%k3} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm18[0,1,2,3],zmm17[4,5,6,7] ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [11,3,11,3,11,3,11,3] ; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] ; AVX512DQ-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm14 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm18 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: movb $28, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm5[2,3,2,3],zmm10[2,3,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm21[0],ymm7[2],ymm21[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm5[2,3,2,3],zmm13[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm13 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm23[0],ymm8[2],ymm23[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $6, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 +; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} ; AVX512DQ-FAST-NEXT: movb $64, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} ; AVX512DQ-FAST-NEXT: movb $56, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm25, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm30, %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm16, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm17, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm31, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm31, %zmm5, %zmm3 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 832(%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm16, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm13[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $64, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm24, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm20 ; AVX512BW-ONLY-SLOW-NEXT: movb $96, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] ; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm19 ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm9, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] ; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm29, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm29, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [2,10,0,3,2,10,0,3] ; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm21, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm16, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %ymm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm28, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm28, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm28, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm24 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm24[0],ymm15[0],ymm24[2],ymm15[2] ; AVX512BW-ONLY-SLOW-NEXT: movb $28, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k3} = zmm20[2,3,2,3],zmm23[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm31, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm29[0],ymm31[0],ymm29[2],ymm31[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm30[2,3,2,3],zmm22[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm30, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm23, %zmm13, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm23, %zmm28, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm22, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm28, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm22, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm23[2,3,2,3],zmm30[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm26[0],ymm15[0],ymm26[2],ymm15[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm23[2,3,2,3],zmm22[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm25, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm1, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm25, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm22, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm21, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm24, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm12 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm20, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $120, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: movb $56, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: movb $-31, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm10 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm0[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm28 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm31 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm24, %ymm4 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm4 = ymm24[1],mem[1],ymm24[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm29[1],ymm31[1],ymm29[3],ymm31[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm26[1],ymm15[1],ymm26[3],ymm15[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride7_vf16: ; AVX512BW-ONLY-FAST: # %bb.0: ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] ; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14] ; AVX512BW-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm19 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm18[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm16 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm18, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm22 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm22 ; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm19[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm19, %zmm0, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm30, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm18, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm22, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm16, %zmm0, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm13, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm24, %zmm26 ; AVX512BW-ONLY-FAST-NEXT: movb $96, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm19, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm24, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: movb $120, %sil -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm22 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm13, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm22 ; AVX512BW-ONLY-FAST-NEXT: movb $24, %dil ; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} ; AVX512BW-ONLY-FAST-NEXT: movb $-31, %dil ; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] ; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm15, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm28, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm1, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm1, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm21 ; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $-61, %dil ; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm21 ; AVX512BW-ONLY-FAST-NEXT: movb $48, %sil ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm19 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm24, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 {%k3} ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm24 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm24, %ymm5, %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $14, %sil -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm14, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm17, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm22, %ymm8, %ymm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm23 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm28[0],ymm24[0],ymm28[2],ymm24[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm17[0,1,2,3],zmm16[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm24, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm24, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm24, %ymm8, %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm26 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm19[0,1,2,3],zmm18[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $28, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm5[2,3,2,3],zmm10[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm8[0],ymm22[0],ymm8[2],ymm22[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm5[2,3,2,3],zmm13[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm24[0],ymm8[2],ymm24[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $6, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $64, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $56, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm26, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm16, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm4, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm18, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm5, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 512(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 576(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 832(%rax) ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm10 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm15 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm16, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm14 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $64, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm28 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [0,8,0,8,0,8,0,8] ; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm24 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm21 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: movb $96, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm17 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm17 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm18, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm18, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm18, %zmm19 ; AVX512DQBW-SLOW-NEXT: movb $24, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm9, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm14 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm11, %zmm10 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm29, %zmm10 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm30, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm0 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] ; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm21, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm15, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %ymm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm28, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm29, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] ; AVX512DQBW-SLOW-NEXT: movb $28, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k3} = zmm20[2,3,2,3],zmm23[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm20 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm22, %zmm31, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %ymm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %ymm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm12, %zmm0, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm29[0],ymm31[0],ymm29[2],ymm31[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k3} = zmm30[2,3,2,3],zmm22[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm30, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm23, %zmm13, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm23, %zmm28, %zmm13 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm12 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm28, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm16, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm23, %zmm8 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm24[2,3,2,3],zmm28[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm29 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm26[0],ymm5[0],ymm26[2],ymm5[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k3} = zmm24[2,3,2,3],zmm22[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm24, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm24, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm25, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,13,2,3,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm1, %zmm24 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] ; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm28, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm25 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm30, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm30 ; AVX512DQBW-SLOW-NEXT: movb $48, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm6 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm0, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm4[0],zmm0[0],zmm4[2],zmm0[2],zmm4[4],zmm0[4],zmm4[6],zmm0[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm13, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm13[0],zmm4[2],zmm13[2],zmm4[4],zmm13[4],zmm4[6],zmm13[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQBW-SLOW-NEXT: movb $12, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k5 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5} ; AVX512DQBW-SLOW-NEXT: movb $112, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm24, %zmm1 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k7 +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm21, %zmm8 {%k7} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $120, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 {%k5} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $6, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k6 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k6} +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k4} ; AVX512DQBW-SLOW-NEXT: movb $56, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm8 {%k7} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm28 {%k3} -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm28 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm9 {%k6} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm30 {%k7} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} ; AVX512DQBW-SLOW-NEXT: movb $-31, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k2} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm25[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $-61, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm11 {%k5} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm23 {%k6} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23 {%k7} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm15 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm12 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 {%k6} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm23, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: movb $14, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm29[1],ymm31[1],ymm29[3],ymm31[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm26[1],ymm5[1],ymm26[3],ymm5[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k1} ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: store_i64_stride7_vf16: ; AVX512DQBW-FAST: # %bb.0: ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm19, %zmm16 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm17 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %xmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm24 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm17[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512DQBW-FAST-NEXT: movb $12, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm0, %zmm6 {%k2} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm25, %zmm17 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k2} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm12 ; AVX512DQBW-FAST-NEXT: movb $112, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm17, %zmm6 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm21, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm22 +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm12, %zmm2 {%k3} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm25 ; AVX512DQBW-FAST-NEXT: movb $96, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm30, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm17, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm25 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm19 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] ; AVX512DQBW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm27, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm19 ; AVX512DQBW-FAST-NEXT: movb $120, %sil -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm24[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm24 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm0, %zmm8 {%k2} -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm25, %zmm8 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm25, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm22[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm4 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm24, %zmm4 {%k3} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm22 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm24, %zmm13, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm25, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm24, %zmm29 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [15,7,15,7] ; AVX512DQBW-FAST-NEXT: # ymm26 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm22 ; AVX512DQBW-FAST-NEXT: movb $24, %dil ; AVX512DQBW-FAST-NEXT: kmovd %edi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm24 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} ; AVX512DQBW-FAST-NEXT: movb $-31, %dil ; AVX512DQBW-FAST-NEXT: kmovd %edi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] ; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm15, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] ; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm13, %zmm28, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm1, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm20 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm16, %zmm1, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm21 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $-61, %dil ; AVX512DQBW-FAST-NEXT: kmovd %edi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm19 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm22, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k3} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm21 ; AVX512DQBW-FAST-NEXT: movb $48, %sil ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm21 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm12, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm27, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k3} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm12 {%k3} ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %ymm21 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm3[0],zmm5[0],zmm3[2],zmm5[2],zmm3[4],zmm5[4],zmm3[6],zmm5[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %ymm23 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm28, %ymm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm27, %ymm5, %ymm7 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: movb $14, %sil -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm14, %zmm22 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm20 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm7 -; AVX512DQBW-FAST-NEXT: vpermi2q %ymm21, %ymm7, %ymm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm22 {%k3} -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm18[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm8 +; AVX512DQBW-FAST-NEXT: vpermi2q %ymm23, %ymm8, %ymm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm25 {%k3} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm18[0,1,2,3],zmm17[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [11,3,11,3,11,3,11,3] ; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] ; AVX512DQBW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm18 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} ; AVX512DQBW-FAST-NEXT: movb $28, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm5[2,3,2,3],zmm10[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm21[0],ymm7[2],ymm21[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm5[2,3,2,3],zmm13[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm13 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm23[0],ymm8[2],ymm23[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $6, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm7 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm7 -; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} ; AVX512DQBW-FAST-NEXT: movb $64, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} ; AVX512DQBW-FAST-NEXT: movb $56, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm25, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm30, %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm16, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm17, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm31, %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm31, %zmm5, %zmm3 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 832(%rax) ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 @@ -4540,38 +4529,38 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm3[0],xmm15[1] ; SSE-NEXT: movapd 224(%rdx), %xmm11 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] -; SSE-NEXT: movapd 224(%rcx), %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm7[0] +; SSE-NEXT: movapd 224(%rcx), %xmm8 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; SSE-NEXT: movapd 224(%r8), %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; SSE-NEXT: movapd 224(%r9), %xmm5 -; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; SSE-NEXT: movapd 240(%rdi), %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] +; SSE-NEXT: movapd 224(%r9), %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] +; SSE-NEXT: movapd 240(%rdi), %xmm5 ; SSE-NEXT: movapd 240(%rsi), %xmm4 -; SSE-NEXT: movapd %xmm6, %xmm8 -; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: movapd %xmm5, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm4[0] ; SSE-NEXT: movapd 240(%rax), %xmm10 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm10[0],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm10[0],xmm5[1] ; SSE-NEXT: movapd 240(%rdx), %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movapd 240(%rcx), %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movapd 240(%r8), %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movapd 240(%rcx), %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movapd 240(%r8), %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movapd 240(%r9), %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, 1776(%rax) -; SSE-NEXT: movapd %xmm1, 1760(%rax) +; SSE-NEXT: movapd %xmm2, 1760(%rax) ; SSE-NEXT: movapd %xmm4, 1744(%rax) -; SSE-NEXT: movapd %xmm6, 1728(%rax) -; SSE-NEXT: movapd %xmm2, 1712(%rax) +; SSE-NEXT: movapd %xmm5, 1728(%rax) +; SSE-NEXT: movapd %xmm1, 1712(%rax) ; SSE-NEXT: movapd %xmm3, 1696(%rax) -; SSE-NEXT: movapd %xmm8, 1680(%rax) -; SSE-NEXT: movapd %xmm5, 1664(%rax) -; SSE-NEXT: movapd %xmm7, 1648(%rax) +; SSE-NEXT: movapd %xmm7, 1680(%rax) +; SSE-NEXT: movapd %xmm6, 1664(%rax) +; SSE-NEXT: movapd %xmm8, 1648(%rax) ; SSE-NEXT: movapd %xmm12, 1632(%rax) ; SSE-NEXT: movapd %xmm15, 1616(%rax) ; SSE-NEXT: movapd %xmm9, 1600(%rax) @@ -4787,9 +4776,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm6 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm7 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm9 ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm0 @@ -4876,8 +4864,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] @@ -4968,8 +4957,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] @@ -5061,14 +5050,14 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm4 -; AVX1-ONLY-NEXT: vbroadcastsd 200(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 200(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] @@ -5082,69 +5071,69 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps 208(%rax), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 208(%rax), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm2[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rcx), %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2],ymm6[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rax), %ymm2, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rax), %ymm3, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,2] ; AVX1-ONLY-NEXT: vmovapd 224(%rax), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2],ymm15[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2],ymm15[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 240(%rcx), %xmm15 ; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm14 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm14[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm15 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm0[1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm10[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm10[0],xmm11[0] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm12[0],mem[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm15 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm12 @@ -5162,13 +5151,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm15, 912(%rsi) ; AVX1-ONLY-NEXT: vmovaps %xmm13, 896(%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 1760(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 1728(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 1696(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 1664(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 1728(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 1696(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 1664(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 1632(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 1600(%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm7, 1568(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 1536(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 1536(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1504(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5268,74 +5257,73 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm13 +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm9 +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm3 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm4, %ymm4 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm5 ; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm3 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm8 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm5 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm13[0],ymm9[0],ymm13[2],ymm9[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm3 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm4 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm4[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm2 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5371,11 +5359,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5411,11 +5399,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5442,17 +5430,19 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rax), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 128(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5466,13 +5456,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps 160(%rax), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm11 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] ; AVX2-ONLY-NEXT: vbroadcastsd 168(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -5486,9 +5476,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5500,8 +5490,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdx), %ymm5, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdx), %ymm7, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm0 @@ -5509,168 +5499,166 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 192(%rax), %xmm4 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm4[0],ymm10[0],ymm4[2],ymm10[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm10[1],ymm4[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 208(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm2 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm4[0,1],ymm0[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 224(%rax), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 224(%rax), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] -; AVX2-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; AVX2-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 240(%rax), %xmm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm10[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd %xmm13, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 240(%rax), %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm2, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm3, %ymm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm4, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%rax), %ymm5 -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[0,1],ymm5[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rax), %ymm7 +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm7[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1760(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1728(%rcx) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm2, 1760(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 1728(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1696(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 1664(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 1664(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1632(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1600(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 1568(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 1536(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 1536(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1504(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5679,10 +5667,10 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1440(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1408(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 1376(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 1376(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1344(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 1312(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 1312(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1280(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5693,8 +5681,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1184(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1152(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 1120(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 1088(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 1120(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 1088(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1056(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5703,7 +5691,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 992(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 928(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 928(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 896(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 864(%rcx) @@ -5717,7 +5705,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 736(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 704(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 672(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 672(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 640(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5769,443 +5757,445 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm25 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm5 ; AVX512F-ONLY-SLOW-NEXT: movb $96, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm17, (%rsp) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm17[0],ymm1[2],ymm17[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] ; AVX512F-ONLY-SLOW-NEXT: movb $28, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm21, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5] ; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm20, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm16[0],ymm19[2],ymm16[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm16, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm15, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm21, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm20, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm0[0],ymm16[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[2,3,2,3],zmm18[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm16[0],ymm0[2],ymm16[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[2,3,2,3],zmm29[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm20, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm31, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm24, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm29, %zmm30, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm31, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm17, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm31, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm5, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm18, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: movb $48, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k3 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm8[0],zmm10[0],zmm8[2],zmm10[2],zmm8[4],zmm10[4],zmm8[6],zmm10[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm8[0],zmm27[0],zmm8[2],zmm27[2],zmm8[4],zmm27[4],zmm8[6],zmm27[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] ; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm12[0],zmm9[0],zmm12[2],zmm9[2],zmm12[4],zmm9[4],zmm12[6],zmm9[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm11[0],zmm13[0],zmm11[2],zmm13[2],zmm11[4],zmm13[4],zmm11[6],zmm13[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm22 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm3, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm11[0],zmm25[0],zmm11[2],zmm25[2],zmm11[4],zmm25[4],zmm11[6],zmm25[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm14[0],zmm0[0],zmm14[2],zmm0[2],zmm14[4],zmm0[4],zmm14[6],zmm0[6] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm14, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm14, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm6[0],zmm19[0],zmm6[2],zmm19[2],zmm6[4],zmm19[4],zmm6[6],zmm19[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm22[0],zmm0[0],zmm22[2],zmm0[2],zmm22[4],zmm0[4],zmm22[6],zmm0[6] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm22, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm22, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm22 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm20, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm10, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm30 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm12 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm20, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm12 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm11 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm20, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm5 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm20, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm17 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm11 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm10, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm6 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm10, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm5 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $120, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm19 = zmm19[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = zmm14[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $-61, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k3} ; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = zmm12[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm26[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm24[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} ; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm31 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm10 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} ; AVX512F-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm6 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm6 = zmm22[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,11,u,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm18, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,11,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm16, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm16[0],mem[0],ymm16[2],mem[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm16[2,3,2,3],zmm4[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm0, %ymm16 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm16 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm16[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = zmm23[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm21, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,11,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm12, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] ; AVX512F-ONLY-SLOW-NEXT: movb $64, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} ; AVX512F-ONLY-SLOW-NEXT: movb $8, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm14, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm13, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm22, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,12,u,3,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm15, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,12,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm12, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm13, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,12,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm14, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm12, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 1472(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1408(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 1472(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1408(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1280(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1216(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1152(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 1024(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 960(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1344(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1088(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1728(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 896(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1728(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 1664(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 1600(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1536(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1600(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1536(%rax) ; AVX512F-ONLY-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq @@ -6214,440 +6204,439 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm4 ; AVX512F-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] ; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm1[0],ymm19[2],ymm1[2] ; AVX512F-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm3[2,3,2,3],zmm4[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm28, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm4[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,7,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm3, %ymm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm25, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm18[0],ymm0[0],ymm18[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm0, %ymm28, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm20[0],ymm26[0],ymm20[2],ymm26[2] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm26, %ymm28, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm6[0],ymm23[2],ymm6[2] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm6, %ymm3, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm22[0],ymm2[0],ymm22[2],ymm2[2] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm2, %ymm3, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm16, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm14[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm18[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm29, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm24 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k2} = zmm3[2,3,2,3],zmm12[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm1, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm27, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm6[2,3,2,3],zmm11[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm20, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm29, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm31, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm31, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm28, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] ; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movb $48, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm1[0],zmm11[0],zmm1[2],zmm11[2],zmm1[4],zmm11[4],zmm1[6],zmm11[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm3, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm7[0],zmm17[0],zmm7[2],zmm17[2],zmm7[4],zmm17[4],zmm7[6],zmm17[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm8, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] ; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm25, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm29, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k3} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm31, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm30[0],zmm15[0],zmm30[2],zmm15[2],zmm30[4],zmm15[4],zmm30[6],zmm15[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm31, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm18, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm25, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm0, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm16, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm15, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm24, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm24, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm22, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm26[0],zmm3[0],zmm26[2],zmm3[2],zmm26[4],zmm3[4],zmm26[6],zmm3[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm19, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: movb $12, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm3, %zmm3 ; AVX512F-ONLY-FAST-NEXT: movb $112, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm25 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm29 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm31 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: movb $14, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $120, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm19 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm31 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} ; AVX512F-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k5} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k4} ; AVX512F-ONLY-FAST-NEXT: movb $24, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 {%k5} -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k5} ; AVX512F-ONLY-FAST-NEXT: movb $-31, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm9 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm28 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm28 {%k4} -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm26 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $6, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm15 {%k4} -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm11 {%k4} -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k4} -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm5 {%k4} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} ; AVX512F-ONLY-FAST-NEXT: movb $56, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm15 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm4, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm9 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm9 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,11,u,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm13, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,11,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm10, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm10[0],mem[0],ymm10[2],mem[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm10[2,3,2,3],zmm8[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm8 = zmm23[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm12, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,11,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm9[2,3,2,3],zmm0[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 ; AVX512F-ONLY-FAST-NEXT: movb $64, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $8, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <12,u,u,3,4,5,6,13> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm7, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,12,u,3,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm9, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,12,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm6, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm19, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,12,u,3,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm8, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 1472(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1408(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1216(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1152(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1408(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1280(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 896(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1600(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1344(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1664(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1536(%rax) ; AVX512F-ONLY-FAST-NEXT: addq $2024, %rsp # imm = 0x7E8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq @@ -6656,876 +6645,879 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm7 ; AVX512DQ-SLOW-NEXT: movb $96, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] ; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,3,11,3,11,3,11,3] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [2,10,0,3,2,10,0,3] -; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm16, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %ymm16 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm16, (%rsp) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %ymm17 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm16[0],ymm1[2],ymm16[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] ; AVX512DQ-SLOW-NEXT: movb $28, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] -; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,0,5,4,12,0,5] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] ; AVX512DQ-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm23 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm19 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] ; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm18, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm20, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm18, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm17, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm3[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm16, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm18, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm20, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %ymm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm0[0],ymm18[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[2,3,2,3],zmm20[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm6 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm11 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm16, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm18, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r9), %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r8), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[2,3,2,3],zmm29[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm7 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm7, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm22 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,5,13,5,13,5,13,5] -; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm25, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm16, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm29, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm30, %zmm17, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: movb $48, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k3 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm19 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] ; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm3 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm22 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm22, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm15 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm11[0],zmm10[0],zmm11[2],zmm10[2],zmm11[4],zmm10[4],zmm11[6],zmm10[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm4, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm22, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm4, %zmm20 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm10[0],zmm15[0],zmm10[2],zmm15[2],zmm10[4],zmm15[4],zmm10[6],zmm15[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm31, %zmm14, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm31, %zmm14, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm14[0],zmm31[0],zmm14[2],zmm31[2],zmm14[4],zmm31[4],zmm14[6],zmm31[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm4, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm22, %zmm12 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm10[0],zmm0[0],zmm10[2],zmm0[2],zmm10[4],zmm0[4],zmm10[6],zmm0[6] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm18 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm15 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm8[0],zmm13[0],zmm8[2],zmm13[2],zmm8[4],zmm13[4],zmm8[6],zmm13[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm31, %zmm24, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm31, %zmm24, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm24[0],zmm31[0],zmm24[2],zmm31[2],zmm24[4],zmm31[4],zmm24[6],zmm31[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm2, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm9 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm0, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm19, %zmm31 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm19 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQ-SLOW-NEXT: movb $120, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k3} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm22 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm22 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm18 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm18 = zmm7[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $-61, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k3} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: movb $24, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k3} -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm27[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k3} +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm3 = zmm5[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm25[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} ; AVX512DQ-SLOW-NEXT: movb $-31, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm12 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm9 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: movb $12, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm29 {%k4} -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm11 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm4 {%k4} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm14 {%k4} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm24 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm2 {%k4} ; AVX512DQ-SLOW-NEXT: movb $112, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm29 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm11 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm14 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm7 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm3 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm3, %zmm4 {%k4} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: movb $6, %sil +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm14 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm5, %zmm10 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm5, %zmm24 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm11 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm1, %zmm2 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: movb $6, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm15 {%k4} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm13 {%k4} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k4} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm15 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} ; AVX512DQ-SLOW-NEXT: movb $56, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm8 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm8 = zmm25[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,11,u,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm20, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,11,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm21, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %ymm21 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm21[0],mem[0],ymm21[2],mem[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm21[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm0, %ymm16 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm16 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm16[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 {%k1} +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm7 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm7 = zmm27[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,11,u,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm16, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,11,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm22, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %ymm16 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm16[0],mem[0],ymm16[2],mem[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k2} = zmm16[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: movb $14, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm16 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] ; AVX512DQ-SLOW-NEXT: movb $64, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} ; AVX512DQ-SLOW-NEXT: movb $8, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm20 {%k2} -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm18, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm10, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,12,u,3,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <13,u,2,3,4,5,6,14> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm16, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,12,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm13, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm19, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm13, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,12,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm7, %zmm12 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 1472(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 1408(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1344(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 1280(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 1216(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 1152(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 1024(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 960(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 896(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 832(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1408(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 1344(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 1280(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 960(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 896(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 576(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 1728(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 1664(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 1600(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 1536(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 1728(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 1664(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 1600(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 1536(%rax) ; AVX512DQ-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i64_stride7_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 +; AVX512DQ-FAST-NEXT: subq $2056, %rsp # imm = 0x808 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm8 ; AVX512DQ-FAST-NEXT: movb $96, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %ymm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %ymm20 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%r9), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] ; AVX512DQ-FAST-NEXT: movb $28, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k2 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[2,3,2,3],zmm7[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,3,7,7] -; AVX512DQ-FAST-NEXT: vpermt2q %ymm1, %ymm27, %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] -; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm8[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm28, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] +; AVX512DQ-FAST-NEXT: vpermt2q %ymm1, %ymm4, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,1,12,7,0,1,12,7] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm30 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] -; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm29, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm18[0],ymm0[0],ymm18[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vpermt2q %ymm0, %ymm27, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm25[0],ymm20[2],ymm25[2] -; AVX512DQ-FAST-NEXT: vpermt2q %ymm25, %ymm27, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm13 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm13[2,3,2,3] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [5,0,14,6,5,0,14,6] +; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] +; AVX512DQ-FAST-NEXT: vpermt2q %ymm5, %ymm4, %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm3[0],ymm22[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vpermt2q %ymm3, %ymm4, %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm19, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm14 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm14[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm29, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm12[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm8[2,3,2,3],zmm12[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm28, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm24, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [3,0,12,4,3,0,12,4] +; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm24, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movb $48, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k3 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm18 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm28 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm5[0],zmm3[0],zmm5[2],zmm3[2],zmm5[4],zmm3[4],zmm5[6],zmm3[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm25 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm29 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm31 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm10[0],zmm16[0],zmm10[2],zmm16[2],zmm10[4],zmm16[4],zmm10[6],zmm16[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm24 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] ; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm26, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm8, %zmm17 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm24, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm14 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm11[0],zmm17[0],zmm11[2],zmm17[2],zmm11[4],zmm17[4],zmm11[6],zmm17[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm18 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm15, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm15 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm23, %zmm4, %zmm29 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm23, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm15, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm18, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm18 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm13, %zmm5, %zmm15 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm13, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm20 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm30[0],zmm8[0],zmm30[2],zmm8[2],zmm30[4],zmm8[4],zmm30[6],zmm8[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm5 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [15,7,15,7] -; AVX512DQ-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm20, %zmm30 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm7 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm5 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm27[0],zmm6[0],zmm27[2],zmm6[2],zmm27[4],zmm6[4],zmm27[6],zmm6[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm27 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm20[0],zmm0[0],zmm20[2],zmm0[2],zmm20[4],zmm0[4],zmm20[6],zmm0[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm11 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: movb $14, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512DQ-FAST-NEXT: movb $120, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm0 = zmm25[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm16 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm29 {%k3} +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm2 = zmm24[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k3} ; AVX512DQ-FAST-NEXT: movb $-61, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k5 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k5} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k4} ; AVX512DQ-FAST-NEXT: movb $24, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k3} -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm1 = zmm21[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k5} -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k5} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm23[0,1,2,3],zmm30[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} ; AVX512DQ-FAST-NEXT: movb $-31, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm30 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm30 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm27 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: movb $12, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm18 {%k4} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm19 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm24 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm7 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k4} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4} ; AVX512DQ-FAST-NEXT: movb $112, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm9, %zmm18 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm8, %zmm19 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm8, %zmm24 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm9 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm5 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm5, %zmm7 {%k4} -; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm29 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm19 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm26 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm11 {%k4} +; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $6, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm28 {%k4} -; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm14 {%k4} -; AVX512DQ-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm4 {%k4} -; AVX512DQ-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm6 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k4} +; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k4} +; AVX512DQ-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4} +; AVX512DQ-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4} ; AVX512DQ-FAST-NEXT: movb $56, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 {%k1} -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm10 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm9 = zmm21[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm12 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm16, %zmm12 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,11,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm12, %zmm13 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%r8), %ymm12 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k2} = zmm12[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3] ; AVX512DQ-FAST-NEXT: movb $64, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} ; AVX512DQ-FAST-NEXT: movb $8, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 {%k3} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm11, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,12,u,3,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm12, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm10, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,12,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm10, %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm20, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,12,u,3,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm10, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm10 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 1472(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1408(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 1024(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 960(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 896(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 1408(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 1344(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 1280(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 832(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 1728(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 1664(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 1600(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1728(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1664(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1536(%rax) -; AVX512DQ-FAST-NEXT: addq $2024, %rsp # imm = 0x7E8 +; AVX512DQ-FAST-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -7534,443 +7526,445 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm25 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: movb $96, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm17, (%rsp) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm17[0],ymm1[2],ymm17[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] ; AVX512BW-ONLY-SLOW-NEXT: movb $28, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm21, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5] ; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm20, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm16[0],ymm19[2],ymm16[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm15, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm21, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm25 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm20, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm0[0],ymm16[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[2,3,2,3],zmm18[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm17 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm16, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm16[0],ymm0[2],ymm16[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[2,3,2,3],zmm29[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm20, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm31, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm24, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm29, %zmm30, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm31, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm17, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm31, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm5, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm18, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm8[0],zmm10[0],zmm8[2],zmm10[2],zmm8[4],zmm10[4],zmm8[6],zmm10[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm8[0],zmm27[0],zmm8[2],zmm27[2],zmm8[4],zmm27[4],zmm8[6],zmm27[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] ; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm12[0],zmm9[0],zmm12[2],zmm9[2],zmm12[4],zmm9[4],zmm12[6],zmm9[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm11[0],zmm13[0],zmm11[2],zmm13[2],zmm11[4],zmm13[4],zmm11[6],zmm13[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm22 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm3, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm11, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm11[0],zmm25[0],zmm11[2],zmm25[2],zmm11[4],zmm25[4],zmm11[6],zmm25[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm14[0],zmm0[0],zmm14[2],zmm0[2],zmm14[4],zmm0[4],zmm14[6],zmm0[6] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm14, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm14, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm6[0],zmm19[0],zmm6[2],zmm19[2],zmm6[4],zmm19[4],zmm6[6],zmm19[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm22[0],zmm0[0],zmm22[2],zmm0[2],zmm22[4],zmm0[4],zmm22[6],zmm0[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm22, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm22, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm22 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm20, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm10, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm30 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm12 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm20, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm12 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm11 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm20, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm5 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm20, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm17 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm11 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm10, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm6 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm10, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm5 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $120, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = zmm19[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm7 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = zmm14[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = zmm12[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm26[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm24[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: movb $-31, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm31 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm10 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm6 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = zmm22[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,11,u,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm18, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,11,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm16, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm16[0],mem[0],ymm16[2],mem[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm16[2,3,2,3],zmm4[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm0, %ymm16 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm16[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm10 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = zmm23[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm21, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,11,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm12, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm25 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] ; AVX512BW-ONLY-SLOW-NEXT: movb $64, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: movb $8, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm14, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm13, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm22, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,12,u,3,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm0, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm15, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,12,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm12, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm13, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,12,3,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm14, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm0, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm12, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm12 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 1472(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1408(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 1472(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1408(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1280(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1216(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1152(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 1024(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 960(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 832(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1344(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1088(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1728(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 896(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1728(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 1664(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 1600(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1536(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1600(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1536(%rax) ; AVX512BW-ONLY-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq @@ -7979,440 +7973,439 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST: # %bb.0: ; AVX512BW-ONLY-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %ymm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm1[0],ymm19[2],ymm1[2] ; AVX512BW-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm3[2,3,2,3],zmm4[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,3,7,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm28, %ymm19 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm4[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,7,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm3, %ymm19 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm25, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm18[0],ymm0[0],ymm18[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm0, %ymm28, %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm20[0],ymm26[0],ymm20[2],ymm26[2] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm26, %ymm28, %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm6[0],ymm23[2],ymm6[2] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm6, %ymm3, %ymm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm22[0],ymm2[0],ymm22[2],ymm2[2] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm2, %ymm3, %ymm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm16, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm14[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm18[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm15, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm29, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm24 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k2} = zmm3[2,3,2,3],zmm12[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm1, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm27, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm6[2,3,2,3],zmm11[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm19 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm20, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm29, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm31, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm31, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm28, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] ; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: movb $48, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm1[0],zmm11[0],zmm1[2],zmm11[2],zmm1[4],zmm11[4],zmm1[6],zmm11[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm3, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm7[0],zmm17[0],zmm7[2],zmm17[2],zmm7[4],zmm17[4],zmm7[6],zmm17[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm8, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] ; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm25, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm29, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k3} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm31, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm30[0],zmm15[0],zmm30[2],zmm15[2],zmm30[4],zmm15[4],zmm30[6],zmm15[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm31, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm18, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm25, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm0, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm16, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm15, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm24, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm24, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm19 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm22, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm26[0],zmm3[0],zmm26[2],zmm3[2],zmm26[4],zmm3[4],zmm26[6],zmm3[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm19, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm19 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm4, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm3, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm4, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm25 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm4, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm29 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm31 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $14, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $120, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm19 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm31 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} ; AVX512BW-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k4} ; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k5} ; AVX512BW-ONLY-FAST-NEXT: movb $-31, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm9 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm28 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm28 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm26 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $6, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm15 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm11 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm5 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} ; AVX512BW-ONLY-FAST-NEXT: movb $56, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm15 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm4, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm9 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm9 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,11,u,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm13, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,11,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm10, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm10[0],mem[0],ymm10[2],mem[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm10[2,3,2,3],zmm8[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm8 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = zmm23[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm12, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,11,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm9[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 ; AVX512BW-ONLY-FAST-NEXT: movb $64, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $8, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <12,u,u,3,4,5,6,13> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm7, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,12,u,3,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm9, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm7, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,12,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm6, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm19, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,12,u,3,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm8, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 1472(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1408(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1216(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1152(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1472(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1408(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1280(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 832(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 512(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 896(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1600(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1344(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 896(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1664(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1536(%rax) ; AVX512BW-ONLY-FAST-NEXT: addq $2024, %rsp # imm = 0x7E8 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq @@ -8421,876 +8414,879 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm7 ; AVX512DQBW-SLOW-NEXT: movb $96, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] ; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm16, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %ymm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %ymm16, (%rsp) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %ymm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm16[0],ymm1[2],ymm16[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] ; AVX512DQBW-SLOW-NEXT: movb $28, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] ; AVX512DQBW-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm23 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm19 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] ; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm18, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm20, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm18, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm17, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm3[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm16, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm18, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm20, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm0[0],ymm18[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[2,3,2,3],zmm20[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm11 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm16, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm18, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r9), %ymm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r8), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[2,3,2,3],zmm29[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm7, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm22 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm25, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm16, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm29, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm30, %zmm17, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: movb $48, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] ; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm22 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm22, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm11[0],zmm10[0],zmm11[2],zmm10[2],zmm11[4],zmm10[4],zmm11[6],zmm10[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm4, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm22, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm4, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm10[0],zmm15[0],zmm10[2],zmm15[2],zmm10[4],zmm15[4],zmm10[6],zmm15[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm14, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm14, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm14[0],zmm31[0],zmm14[2],zmm31[2],zmm14[4],zmm31[4],zmm14[6],zmm31[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm4, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm22, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm10[0],zmm0[0],zmm10[2],zmm0[2],zmm10[4],zmm0[4],zmm10[6],zmm0[6] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm18 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm8[0],zmm13[0],zmm8[2],zmm13[2],zmm8[4],zmm13[4],zmm8[6],zmm13[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm24, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm24, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm24[0],zmm31[0],zmm24[2],zmm31[2],zmm24[4],zmm31[4],zmm24[6],zmm31[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm2, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm0, %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm19, %zmm31 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm19 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $120, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k3} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm22 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm22 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm18 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm18 = zmm7[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $-61, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k3} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: movb $24, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k3} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm27[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k3} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm3 = zmm5[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm25[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} ; AVX512DQBW-SLOW-NEXT: movb $-31, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm12 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm9 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQBW-SLOW-NEXT: movb $12, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm29 {%k4} -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm11 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm4 {%k4} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm14 {%k4} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm5 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm24 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm5 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm2 {%k4} ; AVX512DQBW-SLOW-NEXT: movb $112, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm29 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm11 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm14 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm3 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm3, %zmm4 {%k4} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm14 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm5, %zmm10 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm5, %zmm24 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm1, %zmm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $6, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm15 {%k4} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm13 {%k4} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k4} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm15 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} ; AVX512DQBW-SLOW-NEXT: movb $56, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm8 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm8 = zmm25[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,11,u,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm20, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,11,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm21, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %ymm21 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm21[0],mem[0],ymm21[2],mem[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm21[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm0, %ymm16 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm16 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm16[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm7 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm7 = zmm27[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,11,u,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm16, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,11,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm22, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %ymm16 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm16[0],mem[0],ymm16[2],mem[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k2} = zmm16[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: movb $14, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] ; AVX512DQBW-SLOW-NEXT: movb $64, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} ; AVX512DQBW-SLOW-NEXT: movb $8, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm20 {%k2} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm18, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm10, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,12,u,3,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm16, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,12,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm13, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm19, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm13, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,12,3,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm7, %zmm12 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 1472(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 1408(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1344(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 1280(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 1216(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 1152(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 1024(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 960(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 896(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 832(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1408(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 1344(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 1280(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 896(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 576(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 1728(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 1664(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 1600(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 1536(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 1728(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 1664(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 1600(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 1536(%rax) ; AVX512DQBW-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: store_i64_stride7_vf32: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 +; AVX512DQBW-FAST-NEXT: subq $2056, %rsp # imm = 0x808 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm8 ; AVX512DQBW-FAST-NEXT: movb $96, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %ymm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %ymm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %ymm20 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%r9), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] ; AVX512DQBW-FAST-NEXT: movb $28, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[2,3,2,3],zmm7[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,3,7,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm1, %ymm27, %ymm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm8[2,3,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm28, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm1, %ymm4, %ymm7 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm30 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm29, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm18[0],ymm0[0],ymm18[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm0, %ymm27, %ymm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm25[0],ymm20[2],ymm25[2] -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm25, %ymm27, %ymm20 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm5, %ymm4, %ymm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm3[0],ymm22[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm3, %ymm4, %ymm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm19, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm14 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm14[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm13 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm13[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm7, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm8[2,3,2,3],zmm12[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm29, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm12[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm28, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm24, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm24, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: movb $48, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k3 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm18 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm28 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm5[0],zmm3[0],zmm5[2],zmm3[2],zmm5[4],zmm3[4],zmm5[6],zmm3[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm25 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm29 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm31 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm10[0],zmm16[0],zmm10[2],zmm16[2],zmm10[4],zmm16[4],zmm10[6],zmm16[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm24 ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] ; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm26, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm8, %zmm17 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm24, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm14 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm11[0],zmm17[0],zmm11[2],zmm17[2],zmm11[4],zmm17[4],zmm11[6],zmm17[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm18 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm15, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm23, %zmm4, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm23, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm15, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm18, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm13, %zmm5, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm13, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm20 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm30[0],zmm8[0],zmm30[2],zmm8[2],zmm30[4],zmm8[4],zmm30[6],zmm8[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm5 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [15,7,15,7] -; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm20, %zmm30 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm5 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm27[0],zmm6[0],zmm27[2],zmm6[2],zmm27[4],zmm6[4],zmm27[6],zmm6[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm27 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm20[0],zmm0[0],zmm20[2],zmm0[2],zmm20[4],zmm0[4],zmm20[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: movb $14, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512DQBW-FAST-NEXT: movb $120, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm0 = zmm25[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm16 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm29 {%k3} +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm2 = zmm24[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k3} ; AVX512DQBW-FAST-NEXT: movb $-61, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k5} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k4} ; AVX512DQBW-FAST-NEXT: movb $24, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k3} -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm1 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm1 = zmm21[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k5} -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm23[0,1,2,3],zmm30[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} ; AVX512DQBW-FAST-NEXT: movb $-31, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm30 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm30 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm27 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQBW-FAST-NEXT: movb $12, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm18 {%k4} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm19 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm24 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm7 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k4} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4} ; AVX512DQBW-FAST-NEXT: movb $112, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm9, %zmm18 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm8, %zmm19 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm8, %zmm24 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm9 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm5 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm5, %zmm7 {%k4} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm29 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm19 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm26 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm11 {%k4} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $6, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm28 {%k4} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm14 {%k4} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm4 {%k4} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm6 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k4} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k4} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4} ; AVX512DQBW-FAST-NEXT: movb $56, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 {%k1} -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm10 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm10 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm9 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm9 = zmm21[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm16, %zmm12 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,11,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm12, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%r8), %ymm12 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k2} = zmm12[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3] ; AVX512DQBW-FAST-NEXT: movb $64, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} ; AVX512DQBW-FAST-NEXT: movb $8, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 {%k3} ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm11, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,12,u,3,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm10, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,12,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm20, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,12,u,3,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm10, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm10 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 1472(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1408(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 1024(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 960(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 896(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 1408(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 1344(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 1280(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 896(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 832(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm1, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 1728(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 1664(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 1600(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1728(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1664(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1536(%rax) -; AVX512DQBW-FAST-NEXT: addq $2024, %rsp # imm = 0x7E8 +; AVX512DQBW-FAST-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -9985,38 +9981,38 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm3[0],xmm15[1] ; SSE-NEXT: movapd 480(%rdx), %xmm11 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] -; SSE-NEXT: movapd 480(%rcx), %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm7[0] +; SSE-NEXT: movapd 480(%rcx), %xmm8 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; SSE-NEXT: movapd 480(%r8), %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; SSE-NEXT: movapd 480(%r9), %xmm5 -; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; SSE-NEXT: movapd 496(%rdi), %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] +; SSE-NEXT: movapd 480(%r9), %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] +; SSE-NEXT: movapd 496(%rdi), %xmm5 ; SSE-NEXT: movapd 496(%rsi), %xmm4 -; SSE-NEXT: movapd %xmm6, %xmm8 -; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: movapd %xmm5, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm4[0] ; SSE-NEXT: movapd 496(%rax), %xmm10 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm10[0],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm10[0],xmm5[1] ; SSE-NEXT: movapd 496(%rdx), %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movapd 496(%rcx), %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movapd 496(%r8), %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movapd 496(%rcx), %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movapd 496(%r8), %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movapd 496(%r9), %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, 3568(%rax) -; SSE-NEXT: movapd %xmm1, 3552(%rax) +; SSE-NEXT: movapd %xmm2, 3552(%rax) ; SSE-NEXT: movapd %xmm4, 3536(%rax) -; SSE-NEXT: movapd %xmm6, 3520(%rax) -; SSE-NEXT: movapd %xmm2, 3504(%rax) +; SSE-NEXT: movapd %xmm5, 3520(%rax) +; SSE-NEXT: movapd %xmm1, 3504(%rax) ; SSE-NEXT: movapd %xmm3, 3488(%rax) -; SSE-NEXT: movapd %xmm8, 3472(%rax) -; SSE-NEXT: movapd %xmm5, 3456(%rax) -; SSE-NEXT: movapd %xmm7, 3440(%rax) +; SSE-NEXT: movapd %xmm7, 3472(%rax) +; SSE-NEXT: movapd %xmm6, 3456(%rax) +; SSE-NEXT: movapd %xmm8, 3440(%rax) ; SSE-NEXT: movapd %xmm12, 3424(%rax) ; SSE-NEXT: movapd %xmm15, 3408(%rax) ; SSE-NEXT: movapd %xmm9, 3392(%rax) @@ -10446,7 +10442,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i64_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3832, %rsp # imm = 0xEF8 +; AVX1-ONLY-NEXT: subq $3816, %rsp # imm = 0xEE8 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10943,8 +10939,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 384(%rax), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 384(%rax), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %ymm1 @@ -10957,38 +10953,38 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %ymm10 ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 400(%rax), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%r9), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 384(%r9), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 400(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 400(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 400(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rcx), %ymm1, %ymm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm5[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rcx), %ymm1, %ymm6 ; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2],ymm6[2] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vmovapd 416(%r9), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovapd 416(%r9), %xmm7 ; AVX1-ONLY-NEXT: vmovapd 416(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm6[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm7[0] ; AVX1-ONLY-NEXT: vmovapd 416(%rax), %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 424(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -11004,8 +11000,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm2[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rcx), %ymm1, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 @@ -11207,17 +11203,17 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm11[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] ; AVX1-ONLY-NEXT: vmovaps 400(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm2 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] @@ -11225,107 +11221,107 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd 432(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 432(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 416(%rax), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm2[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 416(%rax), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rax), %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rax), %ymm3, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm3[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 448(%rax), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rax), %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 480(%rax), %ymm11 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 448(%rax), %ymm10 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rax), %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 480(%rax), %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm15[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 464(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd 464(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 464(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm9[0],ymm4[1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 496(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm10[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd 496(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm11[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0],ymm15[1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[0],mem[0] @@ -11341,30 +11337,30 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm8 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 2704(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 2704(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm1, 2688(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 3152(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 3136(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 2256(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 3152(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 3136(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 2256(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm2, 2240(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm13, 1360(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm3, 1344(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm12, 464(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 448(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm10, 912(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 896(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 896(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm14, 1808(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 1792(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 1792(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3520(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11573,13 +11569,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $3832, %rsp # imm = 0xEF8 +; AVX1-ONLY-NEXT: addq $3816, %rsp # imm = 0xEE8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride7_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3896, %rsp # imm = 0xF38 +; AVX2-ONLY-NEXT: subq $3880, %rsp # imm = 0xF28 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1 @@ -11602,13 +11598,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm4 -; AVX2-ONLY-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm8 -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -11647,11 +11643,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11687,11 +11683,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11727,11 +11723,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11767,11 +11763,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11807,11 +11803,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11847,11 +11843,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 192(%r9), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11887,11 +11883,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 224(%r9), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11927,11 +11923,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 256(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 256(%r9), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11967,11 +11963,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 288(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 288(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 288(%r9), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11998,18 +11994,18 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rax), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 320(%rax), %xmm10 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 320(%r9), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%r9), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12028,9 +12024,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %xmm7 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] ; AVX2-ONLY-NEXT: vbroadcastsd 360(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -12044,11 +12040,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 352(%r9), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%r9), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12075,18 +12070,19 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rax), %xmm15 +; AVX2-ONLY-NEXT: vmovaps 384(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 384(%r9), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%r9), %ymm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12100,13 +12096,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps 416(%rax), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %xmm14 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm14[1] ; AVX2-ONLY-NEXT: vbroadcastsd 424(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -12138,9 +12134,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm11 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm10[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[0,1],ymm0[0,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 448(%rax), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] @@ -12165,9 +12161,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm12 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm12[0,1],ymm0[0,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 480(%rax), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] @@ -12192,10 +12188,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 496(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd (%rsp), %ymm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12335,79 +12331,78 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd %xmm9, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] +; AVX2-ONLY-NEXT: vbroadcastsd %xmm10, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 344(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd 352(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm5[1],mem[1],ymm5[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd %xmm15, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 408(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm15[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm12[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX2-ONLY-NEXT: vbroadcastsd 416(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 408(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm13[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 +; AVX2-ONLY-NEXT: vbroadcastsd 416(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 472(%rcx), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 448(%rax), %ymm14 -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm14, %ymm8 -; AVX2-ONLY-NEXT: vbroadcastsd 480(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 472(%rcx), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 480(%rax), %ymm14 -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 448(%rax), %ymm0 +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 +; AVX2-ONLY-NEXT: vbroadcastsd 480(%rcx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rax), %ymm13 +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm11, 3552(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 3520(%rcx) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm12, 3552(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 3520(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm6, 3488(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3456(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12415,18 +12410,18 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3392(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm8, 3360(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 3328(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 3328(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3296(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 3264(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 3232(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 3232(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3200(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3168(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3136(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 3104(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 3104(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3072(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12437,8 +12432,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2976(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2944(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 2912(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 2880(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 2912(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 2880(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2848(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12447,10 +12442,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2784(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2752(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 2720(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 2720(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2688(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 2656(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 2656(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2624(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12461,8 +12456,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2528(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2496(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 2464(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 2432(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 2464(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 2432(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2400(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12471,7 +12466,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2336(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2304(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 2272(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 2272(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2240(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12614,749 +12609,769 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-ONLY-NEXT: addq $3896, %rsp # imm = 0xF38 +; AVX2-ONLY-NEXT: addq $3880, %rsp # imm = 0xF28 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride7_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $6408, %rsp # imm = 0x1908 +; AVX512F-ONLY-SLOW-NEXT: subq $6600, %rsp # imm = 0x19C8 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: movb $96, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX512F-ONLY-SLOW-NEXT: movb $28, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm22, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm19, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm8[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm23 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm9, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm16, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm19, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r9), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r9), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm1[2,3,2,3],zmm25[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm21[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm16, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%r9), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm1[0],ymm25[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm16, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm22, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%r9), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%r8), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm0[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm18, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%r8), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm30[0],ymm4[2],ymm30[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm31, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm19, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%r9), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%r8), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm16, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm22, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm17, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm17, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm17, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm6, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm17, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm11, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm7, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm15 ; AVX512F-ONLY-SLOW-NEXT: movb $48, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k3 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2] ; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] ; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] ; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm8, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm29[0],zmm0[0],zmm29[2],zmm0[2],zmm29[4],zmm0[4],zmm29[6],zmm0[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm29[0],zmm0[0],zmm29[2],zmm0[2],zmm29[4],zmm0[4],zmm29[6],zmm0[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm26[0],zmm24[0],zmm26[2],zmm24[2],zmm26[4],zmm24[4],zmm26[6],zmm24[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm17, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm11, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm7, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm8, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm14[0],zmm28[0],zmm14[2],zmm28[2],zmm14[4],zmm28[4],zmm14[6],zmm28[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm27 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm8, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm8, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm21[0],zmm22[0],zmm21[2],zmm22[2],zmm21[4],zmm22[4],zmm21[6],zmm22[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm7, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm15[0],zmm12[0],zmm15[2],zmm12[2],zmm15[4],zmm12[4],zmm15[6],zmm12[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm17, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm8, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm22, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm22, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm22, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm22[0],zmm14[0],zmm22[2],zmm14[2],zmm22[4],zmm14[4],zmm22[6],zmm14[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm7, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k3} = zmm5[0],zmm0[0],zmm5[2],zmm0[2],zmm5[4],zmm0[4],zmm5[6],zmm0[6] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm21[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,11,u,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,11,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm9[0],zmm0[0],zmm9[2],zmm0[2],zmm9[4],zmm0[4],zmm9[6],zmm0[6] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm9, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,11,u,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,11,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: movb $4, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,10,u,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,10,u,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm12, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,12,u,3,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm18 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm16 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k5} ; AVX512F-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm1 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,9,u,u,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm1, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9 {%k5} ; AVX512F-ONLY-SLOW-NEXT: movb $64, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm14 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm1, %zmm16 ; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,3,4,8,u,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,2,3,4,8,u,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,10,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm7, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm3, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,10,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%r9), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%r9), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%r8), %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%r8), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm4[2,3,2,3],zmm2[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm14[0],ymm4[2],ymm14[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,8,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,9,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,9,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: movb $8, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k5} -; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k5} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k5} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k5} +; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm29 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm19, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 384(%rax), %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 256(%rax), %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 320(%rax), %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 384(%rax), %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} ; AVX512F-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm22 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -13364,50 +13379,51 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm19 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $-61, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm10 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] @@ -13423,9 +13439,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm7 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] @@ -13433,22 +13450,22 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm27 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm11 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm28 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] @@ -13475,35 +13492,35 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm20 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 3008(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2944(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2880(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 2944(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 2880(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, 2816(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2752(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 2624(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2752(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2624(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 2560(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 2496(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 2496(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2432(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2368(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 2304(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2304(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 2240(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2176(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 2048(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 2048(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1984(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1920(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 1856(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1856(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1792(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1664(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1600(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1600(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1536(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 1472(%rax) @@ -13513,9 +13530,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 1152(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 1024(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload @@ -13524,9 +13541,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 576(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13535,16 +13552,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3520(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 3520(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13555,815 +13572,820 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 3072(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $6408, %rsp # imm = 0x1908 +; AVX512F-ONLY-SLOW-NEXT: addq $6600, %rsp # imm = 0x19C8 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i64_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $6568, %rsp # imm = 0x19A8 +; AVX512F-ONLY-FAST-NEXT: subq $6696, %rsp # imm = 0x1A28 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm27[0],ymm1[0],ymm27[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] ; AVX512F-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm3[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [6,13,14,7,6,13,14,7] ; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm25[0],ymm6[0],ymm25[2],ymm6[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm26[0],ymm11[0],ymm26[2],ymm11[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r8), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r8), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm6[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm18[0],ymm14[2],ymm18[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm27, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm21, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%r9), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm10[0],ymm19[2],ymm10[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm30[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm22, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm17[0],ymm12[2],ymm17[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm3[2,3,2,3],zmm30[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm27, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm18, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm30 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%r9), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm4[2,3,2,3],zmm15[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%r9), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm22[0],ymm9[0],ymm22[2],ymm9[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%r9), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%r8), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k2} = zmm10[2,3,2,3],zmm1[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm27, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q (%rsp), %ymm2, %ymm27 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm25 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm23 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm18, %ymm2, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm10, %ymm2, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm3, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%r9), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm26 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm25 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm17, %ymm0, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm9, %ymm0, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm4, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%r9), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%r8), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm0, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movb $48, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm0[0],zmm23[0],zmm0[2],zmm23[2],zmm0[4],zmm23[4],zmm0[6],zmm23[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm29, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm30, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm29, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm2[0],zmm17[0],zmm2[2],zmm17[2],zmm2[4],zmm17[4],zmm2[6],zmm17[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm8, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm30 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm11[0],zmm2[2],zmm11[2],zmm2[4],zmm11[4],zmm2[6],zmm11[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm25, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm30, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm15, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[4],zmm0[6],zmm13[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm25, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm19, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm20[0],zmm0[2],zmm20[2],zmm0[4],zmm20[4],zmm0[6],zmm20[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm27, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm8, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k3} = zmm7[0],zmm11[0],zmm7[2],zmm11[2],zmm7[4],zmm11[4],zmm7[6],zmm11[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm16[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm21, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = <0,1,11,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm5, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm20, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,11,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm5, %zmm15 ; AVX512F-ONLY-FAST-NEXT: movb $4, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,2,10,u,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm14, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,12,u,3,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm9, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,1,2,10,u,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm14, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <12,u,u,3,4,5,6,13> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,12,u,3,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm5, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm21 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm17 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm20, %zmm26 ; AVX512F-ONLY-FAST-NEXT: movb $24, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k4} ; AVX512F-ONLY-FAST-NEXT: movb $6, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k5 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm15 {%k5} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k5} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 {%k4} ; AVX512F-ONLY-FAST-NEXT: movb $64, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm13 ; AVX512F-ONLY-FAST-NEXT: movb $12, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k3} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,4,8,u,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,2,3,4,8,u,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm2, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,1,2,3,9,u,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm1, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm0, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,9,u,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm11 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm18 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k5} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm1[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm3[2,3,2,3],zmm1[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm27, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 ; AVX512F-ONLY-FAST-NEXT: movb $8, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm28 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k4} ; AVX512F-ONLY-FAST-NEXT: movb $-31, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm31 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm29 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm22 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm10 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: movb $112, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 256(%rax), %zmm21, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 {%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 384(%rax), %zmm9, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 256(%rax), %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 320(%rax), %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 384(%rax), %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512F-ONLY-FAST-NEXT: movb $56, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: movb $14, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm21 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm17 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -14371,809 +14393,816 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $120, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm27 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $-61, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm5 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm23[0,1,2,3],zmm26[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm13 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm13 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm29[0,1,2,3],zmm30[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm17 {%k1} ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 3008(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2944(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2880(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm10, 2816(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2752(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm10, 2624(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 2560(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2944(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2880(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm12, 2816(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 2752(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 2688(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm12, 2624(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2560(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 2432(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm9, 2368(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 2304(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 2240(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 2432(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm12, 2368(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 2304(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2240(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2112(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 2112(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1984(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 1920(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1856(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1792(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1984(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm10, 1920(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1856(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1664(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1664(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1600(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 1536(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm9, 1472(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1152(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 1024(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 576(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3520(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 3520(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 3328(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3328(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 3072(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3072(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $6568, %rsp # imm = 0x19A8 +; AVX512F-ONLY-FAST-NEXT: addq $6696, %rsp # imm = 0x1A28 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride7_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $6536, %rsp # imm = 0x1988 +; AVX512DQ-SLOW-NEXT: subq $6472, %rsp # imm = 0x1948 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,3,11,3,11,3,11,3] -; AVX512DQ-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] +; AVX512DQ-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: movb $96, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [9,1,9,1,9,1,9,1] -; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm2 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm22, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX512DQ-SLOW-NEXT: movb $28, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] ; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,1,12,7,0,1,12,7] -; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,14,6,5,0,14,6] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,0,14,6,5,0,14,6] +; AVX512DQ-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [15,7,15,7,15,7,15,7] -; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7] -; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm6 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,13,14,7,6,13,14,7] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm22, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm8[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm20, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r9), %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm22, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm18, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r8), %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm14, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm20, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm16, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm9, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm22, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm18, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r9), %ymm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r9), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r8), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r8), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm1[2,3,2,3],zmm26[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm20, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm16, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm22, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%r9), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %ymm28 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm28, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm28[0],ymm1[0],ymm28[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm27, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%r9), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%r8), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm14 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%r9), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%r8), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm27, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%r9), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm1[2,3,2,3],zmm17[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm15, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm7, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [13,5,13,5,13,5,13,5] -; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%r8), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm17, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm17, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm17, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,5,13,5,13,5,13,5] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm20 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm6, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm27, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm20 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm16 ; AVX512DQ-SLOW-NEXT: movb $48, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k3 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2] ; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm0[0],zmm19[0],zmm0[2],zmm19[2],zmm0[4],zmm19[4],zmm0[6],zmm19[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] ; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm25[0],zmm0[0],zmm25[2],zmm0[2],zmm25[4],zmm0[4],zmm25[6],zmm0[6] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm15 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm29[0],zmm0[0],zmm29[2],zmm0[2],zmm29[4],zmm0[4],zmm29[6],zmm0[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm22[0],zmm23[0],zmm22[2],zmm23[2],zmm22[4],zmm23[4],zmm22[6],zmm23[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm9, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm31 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm24[0],zmm30[0],zmm24[2],zmm30[2],zmm24[4],zmm30[4],zmm24[6],zmm30[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm7, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm30 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm12[0],zmm8[0],zmm12[2],zmm8[2],zmm12[4],zmm8[4],zmm12[6],zmm8[6] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm28[0],zmm21[0],zmm28[2],zmm21[2],zmm28[4],zmm21[4],zmm28[6],zmm21[6] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm17, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm12, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm12, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm12, %zmm14 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm12[0],zmm13[0],zmm12[2],zmm13[2],zmm12[4],zmm13[4],zmm12[6],zmm13[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm7, %zmm16 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k3} = zmm5[0],zmm0[0],zmm5[2],zmm0[2],zmm5[4],zmm0[4],zmm5[6],zmm0[6] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm3[0,1,2,3],zmm21[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,11,u,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,11,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 -; AVX512DQ-SLOW-NEXT: movb $4, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,10,u,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,12,u,3,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm15 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k3} = zmm7[0],zmm10[0],zmm7[2],zmm10[2],zmm7[4],zmm10[4],zmm7[6],zmm10[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm13 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm18 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,11,u,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,11,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512DQ-SLOW-NEXT: movb $4, %sil +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,10,u,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm12, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,u,u,3,4,5,6,13> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm19 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: movb $24, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 {%k5} ; AVX512DQ-SLOW-NEXT: movb $6, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm1 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,2,9,u,u,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k5} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm6 {%k5} ; AVX512DQ-SLOW-NEXT: movb $64, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k4} ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 ; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm17 ; AVX512DQ-SLOW-NEXT: movb $12, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %xmm3 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %xmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k4} ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,3,4,8,u,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm18 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <13,u,2,3,4,5,6,14> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm8, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,10,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%r9), %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%r8), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm6[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,8,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,9,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%r9), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%r8), %ymm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm7[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm6 ; AVX512DQ-SLOW-NEXT: movb $8, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k5} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k5} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k5} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k5} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k5} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k5} ; AVX512DQ-SLOW-NEXT: movb $-31, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k4} ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -15182,81 +15211,83 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k4} ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k4} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm30 {%k4} ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %xmm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm26 {%k4} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm25 {%k4} ; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %xmm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 {%k4} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4} ; AVX512DQ-SLOW-NEXT: movb $112, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm15, %zmm2 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm1, %zmm3 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm19, %zmm7 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm1, %zmm4 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm1, %zmm2 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm10, %zmm4 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm12, %zmm6 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 256(%rax), %zmm1, %zmm28 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm1, %zmm5 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 256(%rax), %zmm1, %zmm30 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 320(%rax), %zmm1, %zmm26 {%k2} -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 384(%rax), %zmm9, %zmm22 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 320(%rax), %zmm1, %zmm25 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 384(%rax), %zmm3, %zmm21 {%k2} ; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k3} ; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k3} ; AVX512DQ-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k3} ; AVX512DQ-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k3} ; AVX512DQ-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm31 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k3} ; AVX512DQ-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm30 {%k3} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm26 {%k3} ; AVX512DQ-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k3} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 {%k3} ; AVX512DQ-SLOW-NEXT: movb $56, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} @@ -15267,43 +15298,42 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512DQ-SLOW-NEXT: movb $120, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm21 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm21 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm15 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm23 ; AVX512DQ-SLOW-NEXT: movb $-61, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm15 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm14 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # zmm11 = zmm1[0,1,2,3],mem[4,5,6,7] @@ -15312,21 +15342,21 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm6 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm5 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # zmm7 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm8 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm8 = zmm14[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm8 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] @@ -15334,44 +15364,43 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: movb $14, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm29 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm28 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] @@ -15379,33 +15408,33 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k1} ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 3008(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2944(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 2944(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 2880(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 2752(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 2624(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2752(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 2624(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 2560(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 2496(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 2496(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 2432(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 2304(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 2240(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 2176(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 2112(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 2304(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 2176(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 2112(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 2048(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 1984(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 1856(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 1792(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1792(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1664(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 1600(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 1600(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 1536(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) @@ -15416,8 +15445,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 1216(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 1152(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 1152(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15426,9 +15455,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 768(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 640(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 576(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15437,16 +15466,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 3520(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 3520(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15457,10 +15486,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 3072(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512DQ-SLOW-NEXT: addq $6536, %rsp # imm = 0x1988 +; AVX512DQ-SLOW-NEXT: addq $6472, %rsp # imm = 0x1948 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -15468,797 +15497,804 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: subq $6568, %rsp # imm = 0x19A8 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,3,11,3,11,3,11,3] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] ; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: movb $96, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %ymm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm27 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm29[0],ymm1[0],ymm29[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm30 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] ; AVX512DQ-FAST-NEXT: movb $28, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k2 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm3[2,3,2,3] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm20 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] ; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,14,6,5,0,14,6] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [6,13,14,7,6,13,14,7] ; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm8[0],ymm27[2],ymm8[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm30[0],ymm9[0],ymm30[2],ymm9[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %ymm23 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm28[0],ymm23[2],ymm28[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%r9), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %ymm24 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm0[0],ymm24[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%r8), %ymm13 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm24[0],ymm13[2],ymm24[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm30, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%r8), %ymm11 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm25[0],ymm11[2],ymm25[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm23[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm20, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm25, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%r9), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%r8), %ymm5 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm4[2,3,2,3],zmm23[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm18, %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa 256(%r9), %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %ymm22 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm12[0],ymm22[2],ymm12[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm3[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm25, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa 320(%r9), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa 320(%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm4[2,3,2,3],zmm10[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%r9), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%r8), %ymm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm10[2,3,2,3],zmm21[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm18, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm25, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,7,7] -; AVX512DQ-FAST-NEXT: vpermt2q (%rsp), %ymm2, %ymm29 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm29, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm27 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %ymm28, %ymm2, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %ymm24, %ymm2, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %ymm12, %ymm2, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %ymm3, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm8, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 384(%r9), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm30 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %ymm25, %ymm0, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %ymm9, %ymm0, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %ymm4, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 384(%r9), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 384(%r8), %ymm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %ymm0, %ymm2, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] +; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] +; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movb $48, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k3 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm21, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm23 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm30 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [15,7,15,7] -; AVX512DQ-FAST-NEXT: # ymm28 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm25, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm15 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm4[0],zmm2[0],zmm4[2],zmm2[2],zmm4[4],zmm2[4],zmm4[6],zmm2[6] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm21, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm2[0],zmm16[0],zmm2[2],zmm16[2],zmm2[4],zmm16[4],zmm2[6],zmm16[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm30, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm3[0],zmm19[0],zmm3[2],zmm19[2],zmm3[4],zmm19[4],zmm3[6],zmm19[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm16[0],zmm2[2],zmm16[2],zmm2[4],zmm16[4],zmm2[6],zmm16[6] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm30, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm18, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm21, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm25 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm0[0],zmm6[0],zmm0[2],zmm6[2],zmm0[4],zmm6[4],zmm0[6],zmm6[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm11, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm18 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm17, %zmm20 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm31[0],zmm0[2],zmm31[2],zmm0[4],zmm31[4],zmm0[6],zmm31[6] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm7[0],zmm5[0],zmm7[2],zmm5[2],zmm7[4],zmm5[4],zmm7[6],zmm5[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm7 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm21, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,1,11,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm29 -; AVX512DQ-FAST-NEXT: movb $4, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,2,10,u,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm16, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,12,u,3,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm21, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm30, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm24 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm21 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm8, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm28 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm23[0],zmm20[0],zmm23[2],zmm20[2],zmm23[4],zmm20[4],zmm23[6],zmm20[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm18, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm29 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm24 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm21 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm12, %zmm11 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,11,u,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,11,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm14 +; AVX512DQ-FAST-NEXT: movb $4, %sil +; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,10,u,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <12,u,u,3,4,5,6,13> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,12,u,3,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm10, %zmm18 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm23 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm27 ; AVX512DQ-FAST-NEXT: movb $24, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 {%k4} ; AVX512DQ-FAST-NEXT: movb $6, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k5 -; AVX512DQ-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm17 {%k5} +; AVX512DQ-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k5} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm17, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k4} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm12 {%k4} ; AVX512DQ-FAST-NEXT: movb $64, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm14 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm2 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm17 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: movb $12, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k3} -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,4,8,u,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,3,9,u,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm1, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = <13,u,2,3,4,5,6,14> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm11 -; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5} -; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k5} -; AVX512DQ-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k5} -; AVX512DQ-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5} -; AVX512DQ-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} -; AVX512DQ-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} -; AVX512DQ-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5} -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %xmm5 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k3} +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm11, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,4,8,u,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,3,9,u,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,2,3,4,5,6,14> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k5} +; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm31 {%k5} +; AVX512DQ-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5} +; AVX512DQ-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k5} +; AVX512DQ-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k5} +; AVX512DQ-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k5} +; AVX512DQ-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k5} +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rax), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rax), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm17[2,3,2,3],zmm1[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm11, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 ; AVX512DQ-FAST-NEXT: movb $8, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm26 {%k4} ; AVX512DQ-FAST-NEXT: movb $-31, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm13 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm31 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm22 {%k3} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm11 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} ; AVX512DQ-FAST-NEXT: movb $112, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm2, %zmm0 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm2, %zmm1 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm13 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm13 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm10 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 256(%rax), %zmm24, %zmm3 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 320(%rax), %zmm21, %zmm31 {%k2} -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 384(%rax), %zmm9, %zmm22 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm1 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm2 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm5 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 256(%rax), %zmm23, %zmm7 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 320(%rax), %zmm27, %zmm11 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm27 +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 384(%rax), %zmm10, %zmm25 {%k2} ; AVX512DQ-FAST-NEXT: movb $56, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm26 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm24 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: movb $14, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm17 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm27 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -16272,145 +16308,146 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512DQ-FAST-NEXT: movb $120, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} ; AVX512DQ-FAST-NEXT: movb $-61, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm23[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm5 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm7 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm8 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm9 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm10 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm11 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm11 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 3008(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 2944(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 2880(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm10, 2816(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 2752(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm10, 2624(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 2560(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 3008(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 2944(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 2880(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 2816(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 2752(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 2688(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 2624(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 2560(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 2432(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm9, 2368(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 2304(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 2240(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 2432(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 2368(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 2304(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 2240(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 2112(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 2112(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 1984(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 1920(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 1856(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 1984(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 1920(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1856(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1664(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 1600(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 1664(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1600(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 1536(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1472(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1408(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 1024(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 512(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 3520(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 3520(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 3328(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3328(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 3072(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 3072(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3136(%rax) ; AVX512DQ-FAST-NEXT: addq $6568, %rsp # imm = 0x19A8 @@ -16419,743 +16456,763 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride7_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $6408, %rsp # imm = 0x1908 +; AVX512BW-ONLY-SLOW-NEXT: subq $6600, %rsp # imm = 0x19C8 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: movb $96, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX512BW-ONLY-SLOW-NEXT: movb $28, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm22, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm19, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm8[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm23 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm9, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm16, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm19, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r9), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r9), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm1[2,3,2,3],zmm25[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm21[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm16, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%r9), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm1[0],ymm25[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm16, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm22, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%r9), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%r8), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm18, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm19, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%r9), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%r8), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm16, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm25, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm22, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm17, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm17, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm17, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %ymm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%r8), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm30[0],ymm4[2],ymm30[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm31, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm6, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm17, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm11, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm7, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k3 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2] ; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] ; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] ; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm8, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm29[0],zmm0[0],zmm29[2],zmm0[2],zmm29[4],zmm0[4],zmm29[6],zmm0[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm29[0],zmm0[0],zmm29[2],zmm0[2],zmm29[4],zmm0[4],zmm29[6],zmm0[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm26[0],zmm24[0],zmm26[2],zmm24[2],zmm26[4],zmm24[4],zmm26[6],zmm24[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm17, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm11, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm7, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm8, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm14[0],zmm28[0],zmm14[2],zmm28[2],zmm14[4],zmm28[4],zmm14[6],zmm28[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm27 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm8, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm8, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm21[0],zmm22[0],zmm21[2],zmm22[2],zmm21[4],zmm22[4],zmm21[6],zmm22[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm7, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm15[0],zmm12[0],zmm15[2],zmm12[2],zmm15[4],zmm12[4],zmm15[6],zmm12[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm17, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm8, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm22, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm22, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm22, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm22[0],zmm14[0],zmm22[2],zmm14[2],zmm22[4],zmm14[4],zmm22[6],zmm14[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm7, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k3} = zmm5[0],zmm0[0],zmm5[2],zmm0[2],zmm5[4],zmm0[4],zmm5[6],zmm0[6] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm21[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,11,u,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,11,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm9[0],zmm0[0],zmm9[2],zmm0[2],zmm9[4],zmm0[4],zmm9[6],zmm0[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm9, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,11,u,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,11,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: movb $4, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,10,u,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,10,u,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm12, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,12,u,3,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm9, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm18 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm16 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k5} ; AVX512BW-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,9,u,u,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm1, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9 {%k5} ; AVX512BW-ONLY-SLOW-NEXT: movb $64, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm14 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm1, %zmm16 ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,3,4,8,u,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,2,3,4,8,u,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,10,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm7, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm3, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,10,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%r9), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%r9), %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%r8), %ymm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%r8), %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm4[2,3,2,3],zmm2[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm14[0],ymm4[2],ymm14[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,8,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,9,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,9,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: movb $8, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k5} ; AVX512BW-ONLY-SLOW-NEXT: movb $-31, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm29 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm19, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 384(%rax), %zmm9, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 256(%rax), %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 320(%rax), %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 384(%rax), %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm22 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -17163,50 +17220,51 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] @@ -17222,9 +17280,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm7 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm7 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] @@ -17232,22 +17291,22 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm27 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm11 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm28 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] @@ -17274,35 +17333,35 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm20 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 3008(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2944(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2880(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 2944(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 2880(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 2816(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2752(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 2624(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2752(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2624(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 2560(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 2496(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 2496(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2432(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2368(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 2304(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2304(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 2240(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2176(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 2048(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 2048(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1984(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1920(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 1856(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1856(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1792(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1664(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1600(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1600(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1536(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 1472(%rax) @@ -17312,9 +17371,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 1152(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 1024(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload @@ -17323,9 +17382,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 576(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17334,16 +17393,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3520(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 3520(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17354,815 +17413,820 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 3072(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $6408, %rsp # imm = 0x1908 +; AVX512BW-ONLY-SLOW-NEXT: addq $6600, %rsp # imm = 0x19C8 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride7_vf64: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $6568, %rsp # imm = 0x19A8 +; AVX512BW-ONLY-FAST-NEXT: subq $6696, %rsp # imm = 0x1A28 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %ymm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm25 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm27[0],ymm1[0],ymm27[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm26 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] ; AVX512BW-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm3[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [6,13,14,7,6,13,14,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm25[0],ymm6[0],ymm25[2],ymm6[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm26[0],ymm11[0],ymm26[2],ymm11[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r8), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r8), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm6[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm18[0],ymm14[2],ymm18[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm27, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm21, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%r9), %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm10[0],ymm19[2],ymm10[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm30[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm22, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %ymm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm17[0],ymm12[2],ymm17[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm3[2,3,2,3],zmm30[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm27, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm18, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm30 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%r9), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%r8), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm4[2,3,2,3],zmm15[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%r9), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %ymm22 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm22[0],ymm9[0],ymm22[2],ymm9[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%r9), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%r8), %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k2} = zmm10[2,3,2,3],zmm1[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm27, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm22, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,7,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q (%rsp), %ymm2, %ymm27 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm25 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm23 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm18, %ymm2, %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm10, %ymm2, %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm3, %ymm2, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%r9), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%r8), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm26 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm25 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm17, %ymm0, %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm9, %ymm0, %ymm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm4, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%r9), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%r8), %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm0, %ymm2, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: movb $48, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm0[0],zmm23[0],zmm0[2],zmm23[2],zmm0[4],zmm23[4],zmm0[6],zmm23[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm29, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm23, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm29, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm2[0],zmm17[0],zmm2[2],zmm17[2],zmm2[4],zmm17[4],zmm2[6],zmm17[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm8, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm30 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm11[0],zmm2[2],zmm11[2],zmm2[4],zmm11[4],zmm2[6],zmm11[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm25, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm30, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm15, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[4],zmm0[6],zmm13[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm25, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm19, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm20[0],zmm0[2],zmm20[2],zmm0[4],zmm20[4],zmm0[6],zmm20[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm27, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm8, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k3} = zmm7[0],zmm11[0],zmm7[2],zmm11[2],zmm7[4],zmm11[4],zmm7[6],zmm11[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm16[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm21, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = <0,1,11,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm5, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm20, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,11,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm5, %zmm15 ; AVX512BW-ONLY-FAST-NEXT: movb $4, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,2,10,u,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm14, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,12,u,3,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm9, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,1,2,10,u,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm14, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <12,u,u,3,4,5,6,13> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,12,u,3,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm5, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm21 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm17 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm20, %zmm26 ; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k4} ; AVX512BW-ONLY-FAST-NEXT: movb $6, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k5 ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm15 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm15, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 {%k4} ; AVX512BW-ONLY-FAST-NEXT: movb $64, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm20 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm2 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,4,8,u,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,2,3,4,8,u,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm2, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,1,2,3,9,u,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm1, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm0, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,9,u,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm11 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm18 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm1[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm3[2,3,2,3],zmm1[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm16, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm27, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: movb $8, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm28 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k4} ; AVX512BW-ONLY-FAST-NEXT: movb $-31, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm31 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm29 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm22 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm10 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: movb $112, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 256(%rax), %zmm21, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 384(%rax), %zmm9, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 256(%rax), %zmm17, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 320(%rax), %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 384(%rax), %zmm20, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512BW-ONLY-FAST-NEXT: movb $56, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $14, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm21 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm17 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -18170,809 +18234,816 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $120, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm27 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $-61, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm23[0,1,2,3],zmm26[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm13 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm29[0,1,2,3],zmm30[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm17 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 3008(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2944(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2880(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm10, 2816(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2752(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm10, 2624(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 2560(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2944(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2880(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm12, 2816(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 2752(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 2688(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm12, 2624(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2560(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 2432(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm9, 2368(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 2304(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 2240(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 2432(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm12, 2368(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 2304(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2240(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2112(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 2112(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1984(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 1920(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1856(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1792(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1984(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm10, 1920(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1856(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1664(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1664(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1600(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 1536(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1472(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm9, 1472(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1152(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 1024(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 576(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3520(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 3520(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 3328(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3328(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 3072(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3072(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $6568, %rsp # imm = 0x19A8 +; AVX512BW-ONLY-FAST-NEXT: addq $6696, %rsp # imm = 0x1A28 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride7_vf64: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $6536, %rsp # imm = 0x1988 +; AVX512DQBW-SLOW-NEXT: subq $6472, %rsp # imm = 0x1948 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: movb $96, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX512DQBW-SLOW-NEXT: movb $28, %r10b +; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm2 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r9), %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r9), %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r8), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm27, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%r9), %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%r8), %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm27, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%r9), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%r8), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm31, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm22, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r8), %ymm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] -; AVX512DQBW-SLOW-NEXT: movb $28, %r10b -; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm22, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm8[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm20, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm22, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm18, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r8), %ymm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm20, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm16, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm22, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm18, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r9), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r8), %ymm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm1[2,3,2,3],zmm26[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm20, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm16, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm9, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm22, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%r9), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %ymm28 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %ymm28, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm28[0],ymm1[0],ymm28[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%r9), %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%r8), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm1[2,3,2,3],zmm17[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm15, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm7, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm17, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm17, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm17, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm17, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm27, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm6, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm6, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm16 ; AVX512DQBW-SLOW-NEXT: movb $48, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k3 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2] ; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm0[0],zmm19[0],zmm0[2],zmm19[2],zmm0[4],zmm19[4],zmm0[6],zmm19[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] ; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm25[0],zmm0[0],zmm25[2],zmm0[2],zmm25[4],zmm0[4],zmm25[6],zmm0[6] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm15 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm29[0],zmm0[0],zmm29[2],zmm0[2],zmm29[4],zmm0[4],zmm29[6],zmm0[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm22[0],zmm23[0],zmm22[2],zmm23[2],zmm22[4],zmm23[4],zmm22[6],zmm23[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm9, %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm24[0],zmm30[0],zmm24[2],zmm30[2],zmm24[4],zmm30[4],zmm24[6],zmm30[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm7, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm12[0],zmm8[0],zmm12[2],zmm8[2],zmm12[4],zmm8[4],zmm12[6],zmm8[6] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm8, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm28[0],zmm21[0],zmm28[2],zmm21[2],zmm28[4],zmm21[4],zmm28[6],zmm21[6] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm17, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k3} = zmm7[0],zmm10[0],zmm7[2],zmm10[2],zmm7[4],zmm10[4],zmm7[6],zmm10[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm12, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm12, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm12, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm12[0],zmm13[0],zmm12[2],zmm13[2],zmm12[4],zmm13[4],zmm12[6],zmm13[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm7, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k3} = zmm5[0],zmm0[0],zmm5[2],zmm0[2],zmm5[4],zmm0[4],zmm5[6],zmm0[6] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm3[0,1,2,3],zmm21[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,11,u,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,11,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,11,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512DQBW-SLOW-NEXT: movb $4, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k3} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,10,u,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,12,u,3,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm15 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm12, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,u,u,3,4,5,6,13> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm19 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: movb $24, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 {%k5} ; AVX512DQBW-SLOW-NEXT: movb $6, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm13 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,2,9,u,u,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k5} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm6 {%k5} ; AVX512DQBW-SLOW-NEXT: movb $64, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k4} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm17 ; AVX512DQBW-SLOW-NEXT: movb $12, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %xmm3 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %xmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k4} ; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,3,4,8,u,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm18 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm8, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm0, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,10,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%r9), %ymm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%r8), %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm6[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,8,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,9,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%r9), %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%r8), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm7[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm6 ; AVX512DQBW-SLOW-NEXT: movb $8, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k5} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k5} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k5} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k5} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k5} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k5} ; AVX512DQBW-SLOW-NEXT: movb $-31, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k4} ; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} ; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -18981,81 +19052,83 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k4} ; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k4} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm30 {%k4} ; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %xmm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm26 {%k4} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm25 {%k4} ; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %xmm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 {%k4} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4} ; AVX512DQBW-SLOW-NEXT: movb $112, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm15, %zmm2 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm1, %zmm3 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm19, %zmm7 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm1, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm1, %zmm2 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm10, %zmm4 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm12, %zmm6 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 256(%rax), %zmm1, %zmm28 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm1, %zmm5 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 256(%rax), %zmm1, %zmm30 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 320(%rax), %zmm1, %zmm26 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 384(%rax), %zmm9, %zmm22 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 320(%rax), %zmm1, %zmm25 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 384(%rax), %zmm3, %zmm21 {%k2} ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k3} ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k3} ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k3} ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k3} ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm31 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k3} ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm30 {%k3} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm26 {%k3} ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k3} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 {%k3} ; AVX512DQBW-SLOW-NEXT: movb $56, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} @@ -19066,43 +19139,42 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $120, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm21 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm21 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm15 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm23 ; AVX512DQBW-SLOW-NEXT: movb $-61, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm15 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm14 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # zmm11 = zmm1[0,1,2,3],mem[4,5,6,7] @@ -19111,21 +19183,21 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm6 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm5 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # zmm7 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm8 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm8 = zmm14[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm8 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] @@ -19133,44 +19205,43 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: movb $14, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm29 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm28 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] @@ -19178,33 +19249,33 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k1} ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 3008(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2944(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 2944(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 2880(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 2752(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 2624(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2752(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 2624(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 2560(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 2496(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 2496(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 2432(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 2304(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 2240(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 2176(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 2112(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 2304(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 2176(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 2112(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 2048(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 1984(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 1856(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 1792(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1792(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1664(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 1600(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 1600(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 1536(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) @@ -19215,8 +19286,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 1216(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 1152(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 1152(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19225,9 +19296,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 768(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 640(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 576(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19236,16 +19307,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 3520(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 3520(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19256,10 +19327,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 3072(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512DQBW-SLOW-NEXT: addq $6536, %rsp # imm = 0x1988 +; AVX512DQBW-SLOW-NEXT: addq $6472, %rsp # imm = 0x1948 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; @@ -19267,797 +19338,804 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST: # %bb.0: ; AVX512DQBW-FAST-NEXT: subq $6568, %rsp # imm = 0x19A8 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] ; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: movb $96, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm8 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %ymm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm27 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm29[0],ymm1[0],ymm29[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm9 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm6 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm30 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] ; AVX512DQBW-FAST-NEXT: movb $28, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm3[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm20 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] ; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [6,13,14,7,6,13,14,7] ; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm8[0],ymm27[2],ymm8[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm30[0],ymm9[0],ymm30[2],ymm9[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %ymm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %ymm23 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm28[0],ymm23[2],ymm28[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%r9), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %ymm24 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm0[0],ymm24[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %ymm24 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%r8), %ymm13 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm24[0],ymm13[2],ymm24[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm30, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %ymm25 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%r8), %ymm11 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm25[0],ymm11[2],ymm25[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm23[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm20, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm25, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%r9), %ymm9 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%r8), %ymm5 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm4[2,3,2,3],zmm23[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%r9), %ymm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %ymm22 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm12[0],ymm22[2],ymm12[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm3[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm18, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm25, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%r9), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%r8), %ymm1 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm4[2,3,2,3],zmm10[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%r9), %ymm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%r8), %ymm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm10[2,3,2,3],zmm21[2,3,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm18, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm25, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm10 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,7,7] -; AVX512DQBW-FAST-NEXT: vpermt2q (%rsp), %ymm2, %ymm29 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm29, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm27 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm28, %ymm2, %ymm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm24, %ymm2, %ymm13 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm12, %ymm2, %ymm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm3, %ymm2, %ymm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm8, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%r9), %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%r8), %ymm1 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm30 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm25, %ymm0, %ymm11 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm9, %ymm0, %ymm5 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm4, %ymm0, %ymm2 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%r9), %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%r8), %ymm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm0, %ymm2, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: movb $48, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k3 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm21, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm23 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm30 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [15,7,15,7] -; AVX512DQBW-FAST-NEXT: # ymm28 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: # ymm18 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm4[0],zmm2[0],zmm4[2],zmm2[2],zmm4[4],zmm2[4],zmm4[6],zmm2[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm21, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm16[0],zmm2[2],zmm16[2],zmm2[4],zmm16[4],zmm2[6],zmm16[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm30, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm18, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm11, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm15 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm31[0],zmm0[2],zmm31[2],zmm0[4],zmm31[4],zmm0[6],zmm31[6] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm21, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm2[0],zmm16[0],zmm2[2],zmm16[2],zmm2[4],zmm16[4],zmm2[6],zmm16[6] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm30, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm18, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm18, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm3[0],zmm19[0],zmm3[2],zmm19[2],zmm3[4],zmm19[4],zmm3[6],zmm19[6] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm8, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm28 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm23[0],zmm20[0],zmm23[2],zmm20[2],zmm23[4],zmm20[4],zmm23[6],zmm20[6] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm18, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm18, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm25 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm0[0],zmm6[0],zmm0[2],zmm6[2],zmm0[4],zmm6[4],zmm0[6],zmm6[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm17, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm7[0],zmm5[0],zmm7[2],zmm5[2],zmm7[4],zmm5[4],zmm7[6],zmm5[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm0, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm21, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,1,11,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm29 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm12, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,11,u,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,11,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm14 ; AVX512DQBW-FAST-NEXT: movb $4, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,2,10,u,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm16, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,12,u,3,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm24 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm21 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,10,u,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <12,u,u,3,4,5,6,13> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,12,u,3,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm10, %zmm18 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm23 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm27 ; AVX512DQBW-FAST-NEXT: movb $24, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 {%k4} ; AVX512DQBW-FAST-NEXT: movb $6, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k5 -; AVX512DQBW-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm17 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k5} ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm17, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k4} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm12 {%k4} ; AVX512DQBW-FAST-NEXT: movb $64, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm14 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm2 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm10 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: movb $12, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k3} -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,4,8,u,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,3,9,u,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm1, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm11 -; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5} -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %xmm5 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k3} +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm11, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,4,8,u,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,3,9,u,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm31 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rax), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rax), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm17[2,3,2,3],zmm1[2,3,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm11, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 ; AVX512DQBW-FAST-NEXT: movb $8, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm26 {%k4} ; AVX512DQBW-FAST-NEXT: movb $-31, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm13 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm31 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm22 {%k3} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm11 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} ; AVX512DQBW-FAST-NEXT: movb $112, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm2, %zmm0 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm2, %zmm1 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm13 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm13 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm10 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 256(%rax), %zmm24, %zmm3 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 320(%rax), %zmm21, %zmm31 {%k2} -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 384(%rax), %zmm9, %zmm22 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm1 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm2 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm5 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 256(%rax), %zmm23, %zmm7 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 320(%rax), %zmm27, %zmm11 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm27 +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 384(%rax), %zmm10, %zmm25 {%k2} ; AVX512DQBW-FAST-NEXT: movb $56, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm26 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm24 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: movb $14, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm17 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm27 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -20071,145 +20149,146 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512DQBW-FAST-NEXT: movb $120, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} ; AVX512DQBW-FAST-NEXT: movb $-61, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm23[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm5 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm7 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm8 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm9 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm9 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm10 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm11 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm11 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 3008(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 2944(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 2880(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm10, 2816(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 2752(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm10, 2624(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 2560(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 3008(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 2944(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 2880(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 2816(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 2752(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 2688(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 2624(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 2560(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 2432(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm9, 2368(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 2304(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 2240(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 2432(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 2368(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 2304(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 2240(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 2112(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 2112(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 1984(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 1920(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 1856(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 1984(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 1920(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1856(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1664(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 1600(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 1664(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1600(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 1536(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1472(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1408(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 1024(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 512(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 3520(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 3520(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 3328(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3328(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 3072(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 3072(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3136(%rax) ; AVX512DQBW-FAST-NEXT: addq $6568, %rsp # imm = 0x19A8 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll index efb3a98dc68d7..af03a07a592e6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll @@ -154,7 +154,7 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps (%rdi), %xmm4 ; SSE-NEXT: movaps 16(%rdi), %xmm2 ; SSE-NEXT: movaps (%rsi), %xmm10 -; SSE-NEXT: movaps 16(%rsi), %xmm11 +; SSE-NEXT: movaps 16(%rsi), %xmm14 ; SSE-NEXT: movaps (%rdx), %xmm1 ; SSE-NEXT: movaps 16(%rdx), %xmm3 ; SSE-NEXT: movaps (%rcx), %xmm7 @@ -164,8 +164,8 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps (%r9), %xmm13 ; SSE-NEXT: movaps (%r10), %xmm6 ; SSE-NEXT: movaps 16(%r10), %xmm9 -; SSE-NEXT: movaps (%rax), %xmm14 -; SSE-NEXT: movaps 16(%rax), %xmm15 +; SSE-NEXT: movaps (%rax), %xmm15 +; SSE-NEXT: movaps 16(%rax), %xmm11 ; SSE-NEXT: movaps %xmm1, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -177,21 +177,21 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm12[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1] ; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; SSE-NEXT: movaps %xmm6, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm14[1] -; SSE-NEXT: movaps %xmm5, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0] +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm14[1] +; SSE-NEXT: movaps %xmm6, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm15[1] +; SSE-NEXT: movaps %xmm5, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm13[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] ; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] -; SSE-NEXT: movaps 16(%r9), %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1] +; SSE-NEXT: movaps 16(%r9), %xmm11 ; SSE-NEXT: movaps %xmm0, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm0, 224(%rax) ; SSE-NEXT: movaps %xmm9, 240(%rax) @@ -199,8 +199,8 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps %xmm13, 176(%rax) ; SSE-NEXT: movaps %xmm5, 96(%rax) ; SSE-NEXT: movaps %xmm6, 112(%rax) -; SSE-NEXT: movaps %xmm14, 32(%rax) -; SSE-NEXT: movaps %xmm11, 48(%rax) +; SSE-NEXT: movaps %xmm15, 32(%rax) +; SSE-NEXT: movaps %xmm14, 48(%rax) ; SSE-NEXT: movaps %xmm2, 192(%rax) ; SSE-NEXT: movaps %xmm3, 208(%rax) ; SSE-NEXT: movaps %xmm12, 128(%rax) @@ -384,56 +384,56 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: subq $152, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movaps (%rdi), %xmm3 -; SSE-NEXT: movaps 16(%rdi), %xmm5 -; SSE-NEXT: movaps (%rsi), %xmm1 -; SSE-NEXT: movaps 16(%rsi), %xmm13 -; SSE-NEXT: movaps (%rdx), %xmm4 -; SSE-NEXT: movaps 16(%rdx), %xmm7 -; SSE-NEXT: movaps (%rcx), %xmm2 -; SSE-NEXT: movaps 16(%rcx), %xmm14 -; SSE-NEXT: movaps (%r8), %xmm6 -; SSE-NEXT: movaps 16(%r8), %xmm9 -; SSE-NEXT: movaps (%r9), %xmm11 -; SSE-NEXT: movaps 16(%r9), %xmm0 -; SSE-NEXT: movaps (%r10), %xmm8 +; SSE-NEXT: movaps (%rdi), %xmm7 +; SSE-NEXT: movaps 16(%rdi), %xmm9 +; SSE-NEXT: movaps (%rsi), %xmm3 +; SSE-NEXT: movaps 16(%rsi), %xmm0 +; SSE-NEXT: movaps (%rdx), %xmm8 +; SSE-NEXT: movaps 16(%rdx), %xmm11 +; SSE-NEXT: movaps (%rcx), %xmm4 +; SSE-NEXT: movaps 16(%rcx), %xmm1 +; SSE-NEXT: movaps (%r8), %xmm10 +; SSE-NEXT: movaps 16(%r8), %xmm13 +; SSE-NEXT: movaps (%r9), %xmm5 +; SSE-NEXT: movaps 16(%r9), %xmm2 +; SSE-NEXT: movaps (%r10), %xmm12 ; SSE-NEXT: movaps 16(%r10), %xmm15 -; SSE-NEXT: movaps (%rax), %xmm12 -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movaps (%rax), %xmm6 +; SSE-NEXT: movaps %xmm7, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm3[0] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm11[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm12[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm12[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm14[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 16(%rax), %xmm0 ; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -441,61 +441,61 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm13 -; SSE-NEXT: movaps 32(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; SSE-NEXT: movaps 32(%rdx), %xmm14 +; SSE-NEXT: movaps 32(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 32(%rdx), %xmm11 ; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm14, %xmm15 +; SSE-NEXT: movaps %xmm11, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 32(%r8), %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 32(%r8), %xmm10 ; SSE-NEXT: movaps 32(%r9), %xmm0 -; SSE-NEXT: movaps %xmm8, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movaps 32(%r10), %xmm10 -; SSE-NEXT: movaps 32(%rax), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps %xmm10, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 32(%r10), %xmm8 +; SSE-NEXT: movaps 32(%rax), %xmm1 +; SSE-NEXT: movaps %xmm8, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movaps 48(%rdi), %xmm6 -; SSE-NEXT: movaps 48(%rsi), %xmm2 +; SSE-NEXT: movaps 48(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm6, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-NEXT: movaps 48(%rdx), %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 48(%rdx), %xmm5 ; SSE-NEXT: movaps 48(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: movaps %xmm5, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps 48(%r8), %xmm1 -; SSE-NEXT: movaps 48(%r9), %xmm3 +; SSE-NEXT: movaps 48(%r9), %xmm2 ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movaps 48(%r10), %xmm3 -; SSE-NEXT: movaps 48(%rax), %xmm5 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movaps 48(%r10), %xmm2 +; SSE-NEXT: movaps 48(%rax), %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 496(%rax) +; SSE-NEXT: movaps %xmm2, 496(%rax) ; SSE-NEXT: movaps %xmm1, 480(%rax) -; SSE-NEXT: movaps %xmm2, 464(%rax) +; SSE-NEXT: movaps %xmm5, 464(%rax) ; SSE-NEXT: movaps %xmm6, 448(%rax) ; SSE-NEXT: movaps %xmm0, 432(%rax) ; SSE-NEXT: movaps %xmm4, 416(%rax) ; SSE-NEXT: movaps %xmm7, 400(%rax) ; SSE-NEXT: movaps %xmm9, 384(%rax) -; SSE-NEXT: movaps %xmm10, 368(%rax) -; SSE-NEXT: movaps %xmm8, 352(%rax) -; SSE-NEXT: movaps %xmm14, 336(%rax) +; SSE-NEXT: movaps %xmm8, 368(%rax) +; SSE-NEXT: movaps %xmm10, 352(%rax) +; SSE-NEXT: movaps %xmm11, 336(%rax) ; SSE-NEXT: movaps %xmm13, 320(%rax) -; SSE-NEXT: movaps %xmm11, 304(%rax) -; SSE-NEXT: movaps %xmm12, 288(%rax) +; SSE-NEXT: movaps %xmm12, 304(%rax) +; SSE-NEXT: movaps %xmm14, 288(%rax) ; SSE-NEXT: movaps %xmm15, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rax) @@ -611,36 +611,36 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm14[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm14[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm1[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm15[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm13[1],xmm12[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm13[1],xmm12[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm11, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm4, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 416(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rdx) @@ -665,102 +665,102 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm3 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm7 ; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm7 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm8 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm7[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm14 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm15[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm13 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm13[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm10[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%r10), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%r10), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm13[1],xmm11[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rdx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm7[0],xmm5[0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rdx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm7[0] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm15[0],xmm14[0] -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm12, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm12 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm10[0],xmm9[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm13[0] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm5, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm13 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm10[0],xmm9[0] ; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm6, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm11[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm11[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm15, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 16(%rax), %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm15 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm15[1],ymm1[3],ymm15[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm3 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm13 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm8 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 384(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 416(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm9, (%rdx) @@ -1194,74 +1194,74 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%r10), %xmm15 +; SSE-NEXT: movaps 80(%r10), %xmm2 ; SSE-NEXT: movaps 80(%rax), %xmm0 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 96(%rdi), %xmm11 -; SSE-NEXT: movaps 96(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] -; SSE-NEXT: movaps 96(%rdx), %xmm13 -; SSE-NEXT: movaps 96(%rcx), %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm13 +; SSE-NEXT: movaps 96(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 96(%r8), %xmm8 +; SSE-NEXT: movaps 96(%rdx), %xmm10 +; SSE-NEXT: movaps 96(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm10, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 96(%r8), %xmm11 ; SSE-NEXT: movaps 96(%r9), %xmm0 -; SSE-NEXT: movaps %xmm8, %xmm14 +; SSE-NEXT: movaps %xmm11, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 96(%r10), %xmm9 ; SSE-NEXT: movaps 96(%rax), %xmm0 ; SSE-NEXT: movaps %xmm9, %xmm12 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 112(%rdi), %xmm6 -; SSE-NEXT: movaps 112(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-NEXT: movaps 112(%rdx), %xmm2 +; SSE-NEXT: movaps 112(%rdi), %xmm7 +; SSE-NEXT: movaps 112(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps 112(%rdx), %xmm5 ; SSE-NEXT: movaps 112(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps 112(%r8), %xmm1 -; SSE-NEXT: movaps 112(%r9), %xmm3 +; SSE-NEXT: movaps 112(%r9), %xmm2 ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movaps 112(%r10), %xmm3 -; SSE-NEXT: movaps 112(%rax), %xmm5 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movaps 112(%r10), %xmm2 +; SSE-NEXT: movaps 112(%rax), %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 1008(%rax) +; SSE-NEXT: movaps %xmm2, 1008(%rax) ; SSE-NEXT: movaps %xmm1, 992(%rax) -; SSE-NEXT: movaps %xmm2, 976(%rax) -; SSE-NEXT: movaps %xmm6, 960(%rax) +; SSE-NEXT: movaps %xmm5, 976(%rax) +; SSE-NEXT: movaps %xmm7, 960(%rax) ; SSE-NEXT: movaps %xmm0, 944(%rax) ; SSE-NEXT: movaps %xmm4, 928(%rax) -; SSE-NEXT: movaps %xmm7, 912(%rax) -; SSE-NEXT: movaps %xmm10, 896(%rax) +; SSE-NEXT: movaps %xmm6, 912(%rax) +; SSE-NEXT: movaps %xmm8, 896(%rax) ; SSE-NEXT: movaps %xmm9, 880(%rax) -; SSE-NEXT: movaps %xmm8, 864(%rax) -; SSE-NEXT: movaps %xmm13, 848(%rax) -; SSE-NEXT: movaps %xmm11, 832(%rax) +; SSE-NEXT: movaps %xmm11, 864(%rax) +; SSE-NEXT: movaps %xmm10, 848(%rax) +; SSE-NEXT: movaps %xmm13, 832(%rax) ; SSE-NEXT: movaps %xmm12, 816(%rax) ; SSE-NEXT: movaps %xmm14, 800(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 784(%rax) +; SSE-NEXT: movaps %xmm15, 784(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 768(%rax) -; SSE-NEXT: movaps %xmm15, 752(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 752(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 736(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1492,50 +1492,50 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm4[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 80(%r8), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 80(%rax), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 88(%rdx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vbroadcastsd 88(%rdx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vbroadcastsd 88(%r10), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm5[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] @@ -1545,17 +1545,17 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 112(%rax), %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rdx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rdx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%r10), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%r10), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] @@ -1574,8 +1574,8 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 736(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 704(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 672(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 640(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 672(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 640(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1584,9 +1584,9 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 480(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 448(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 416(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1597,8 +1597,8 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1645,10 +1645,10 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm6 ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm13 +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm15 ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm7 ; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm1[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 8(%r10), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -1662,22 +1662,22 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm8 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm7 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm6 @@ -1699,18 +1699,18 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastsd 104(%r10), %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%rax), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm14, %ymm14 -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd (%rsp), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm13 = xmm13[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm13, %ymm13 +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd (%rsp), %xmm15, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm15[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm13, %ymm13 ; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] @@ -1726,15 +1726,15 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm13 = xmm13[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm13, %ymm13 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm12, %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm9, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm11, %ymm11 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm7, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm10[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%r10), %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm6, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] @@ -1756,98 +1756,98 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%rax), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%rax), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm2 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm7 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm6[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm8 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm6[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 80(%rax), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%rdx), %ymm11 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%r10), %ymm9 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%r10), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm11[0],ymm1[2],ymm11[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm15 ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%rax), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%rax), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%rdx), %ymm11 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm11[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%r10), %ymm11 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm11[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 992(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 960(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 928(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 960(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 928(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 896(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 736(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 736(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 704(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 672(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 640(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 672(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 640(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm8, 480(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 448(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 416(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2712,74 +2712,74 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%r10), %xmm15 +; SSE-NEXT: movaps 208(%r10), %xmm2 ; SSE-NEXT: movaps 208(%rax), %xmm0 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 224(%rdi), %xmm11 -; SSE-NEXT: movaps 224(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] -; SSE-NEXT: movaps 224(%rdx), %xmm13 -; SSE-NEXT: movaps 224(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdi), %xmm14 +; SSE-NEXT: movaps 224(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 224(%r8), %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdx), %xmm10 +; SSE-NEXT: movaps 224(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm10, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 224(%r8), %xmm12 ; SSE-NEXT: movaps 224(%r9), %xmm0 -; SSE-NEXT: movaps %xmm8, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movaps 224(%r10), %xmm9 +; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 224(%r10), %xmm8 ; SSE-NEXT: movaps 224(%rax), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 240(%rdi), %xmm6 -; SSE-NEXT: movaps 240(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-NEXT: movaps 240(%rdx), %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 240(%rdi), %xmm5 +; SSE-NEXT: movaps 240(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 240(%rdx), %xmm6 ; SSE-NEXT: movaps 240(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: movaps %xmm6, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] ; SSE-NEXT: movaps 240(%r8), %xmm1 -; SSE-NEXT: movaps 240(%r9), %xmm3 +; SSE-NEXT: movaps 240(%r9), %xmm2 ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movaps 240(%r10), %xmm3 -; SSE-NEXT: movaps 240(%rax), %xmm5 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movaps 240(%r10), %xmm2 +; SSE-NEXT: movaps 240(%rax), %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 2032(%rax) +; SSE-NEXT: movaps %xmm2, 2032(%rax) ; SSE-NEXT: movaps %xmm1, 2016(%rax) -; SSE-NEXT: movaps %xmm2, 2000(%rax) -; SSE-NEXT: movaps %xmm6, 1984(%rax) +; SSE-NEXT: movaps %xmm6, 2000(%rax) +; SSE-NEXT: movaps %xmm5, 1984(%rax) ; SSE-NEXT: movaps %xmm0, 1968(%rax) ; SSE-NEXT: movaps %xmm4, 1952(%rax) ; SSE-NEXT: movaps %xmm7, 1936(%rax) -; SSE-NEXT: movaps %xmm10, 1920(%rax) -; SSE-NEXT: movaps %xmm9, 1904(%rax) -; SSE-NEXT: movaps %xmm8, 1888(%rax) -; SSE-NEXT: movaps %xmm13, 1872(%rax) -; SSE-NEXT: movaps %xmm11, 1856(%rax) -; SSE-NEXT: movaps %xmm12, 1840(%rax) -; SSE-NEXT: movaps %xmm14, 1824(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1808(%rax) +; SSE-NEXT: movaps %xmm9, 1920(%rax) +; SSE-NEXT: movaps %xmm8, 1904(%rax) +; SSE-NEXT: movaps %xmm12, 1888(%rax) +; SSE-NEXT: movaps %xmm10, 1872(%rax) +; SSE-NEXT: movaps %xmm14, 1856(%rax) +; SSE-NEXT: movaps %xmm11, 1840(%rax) +; SSE-NEXT: movaps %xmm13, 1824(%rax) +; SSE-NEXT: movaps %xmm15, 1808(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1792(%rax) -; SSE-NEXT: movaps %xmm15, 1776(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1776(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1760(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3007,7 +3007,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i64_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1704, %rsp # imm = 0x6A8 +; AVX1-ONLY-NEXT: subq $1672, %rsp # imm = 0x688 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm2 @@ -3250,7 +3250,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -3270,11 +3270,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -3313,8 +3313,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 120(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 120(%r10), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -3358,7 +3357,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 184(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 184(%r10), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -3371,67 +3370,66 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 208(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 208(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 208(%r8), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 208(%rax), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 216(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 216(%r10), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 240(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 240(%r9), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 216(%r10), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 240(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps 240(%r9), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 240(%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm6[0],xmm4[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 240(%rax), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%r10), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm6[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 240(%rax), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%r10), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2016(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 1984(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 1952(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 1984(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 1952(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm7, 1920(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1888(%rdx) @@ -3441,9 +3439,10 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1824(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1792(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 1760(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 1728(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 1696(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 1760(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 1728(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1696(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1664(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3454,8 +3453,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1568(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1536(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 1504(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 1472(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 1504(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 1472(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1440(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3468,7 +3467,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1312(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1280(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 1248(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 1248(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 1216(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1184(%rdx) @@ -3482,8 +3481,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1056(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1024(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 992(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 960(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 992(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 960(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 928(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3510,9 +3509,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 480(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 448(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3525,7 +3523,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) @@ -3541,7 +3539,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: addq $1704, %rsp # imm = 0x6A8 +; AVX1-ONLY-NEXT: addq $1672, %rsp # imm = 0x688 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3598,11 +3596,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 @@ -3610,11 +3608,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm1 @@ -3847,10 +3845,10 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -3867,10 +3865,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -3910,11 +3909,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%r10), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -3937,76 +3936,74 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastsd 152(%r10), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 176(%rax), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX2-ONLY-NEXT: vbroadcastsd 184(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vbroadcastsd 184(%rdx), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 184(%r10), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 192(%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 208(%rax), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%rdx), %ymm4 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 192(%r9), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 208(%rax), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%rdx), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 216(%r10), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 224(%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%rax), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%rdx), %ymm4 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%r10), %ymm4 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 224(%r9), %ymm6 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 240(%rax), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%rdx), %ymm3 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%r10), %ymm3 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4014,54 +4011,55 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 2016(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 1984(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 1952(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 1920(%rdx) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm2, 2016(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 1984(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 1952(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 1920(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 1760(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1728(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1696(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1664(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 1504(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 1472(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 1728(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 1696(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 1664(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1504(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 1472(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1440(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1408(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1248(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 1248(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 1216(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1184(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1152(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 992(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 960(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 992(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 960(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 928(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 896(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 736(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 704(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 736(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 704(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 672(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 640(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 480(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 448(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 448(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4142,3901 +4140,975 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf32: -; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $2568, %rsp # imm = 0xA08 -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: movb $-64, %r11b -; AVX512F-ONLY-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm26, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm26, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm26, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm25, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm10, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm24, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm11, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm17 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm26, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm31, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1728(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1664(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1216(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1152(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1984(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm11, 1920(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1792(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1408(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1344(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1280(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $2568, %rsp # imm = 0xA08 -; AVX512F-ONLY-SLOW-NEXT: vzeroupper -; AVX512F-ONLY-SLOW-NEXT: retq +; AVX512F-LABEL: store_i64_stride8_vf32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: vmovaps 128(%rdi), %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm25 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512F-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512F-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm21 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm28 +; AVX512F-NEXT: vmovdqa64 (%r10), %zmm17 +; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512F-NEXT: vmovdqa64 (%rax), %zmm24 +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm22 +; AVX512F-NEXT: movb $-64, %r11b +; AVX512F-NEXT: kmovw %r11d, %k1 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm15, %zmm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] +; AVX512F-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm9 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] +; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm17[0],zmm24[0],zmm17[2],zmm24[2],zmm17[4],zmm24[4],zmm17[6],zmm24[6] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm29, %zmm9 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] +; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm15, %zmm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm15, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm8, %zmm5 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm19[1],zmm28[1],zmm19[3],zmm28[3],zmm19[5],zmm28[5],zmm19[7],zmm28[7] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm8, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm5 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm29, %zmm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm1, %zmm13, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm13, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm25, %zmm29, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%r10), %zmm30 +; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm21 +; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 192(%r10), %zmm14 +; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm31 +; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm29, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm29, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm29, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm29, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm6[0],zmm30[2],zmm6[2],zmm30[4],zmm6[4],zmm30[6],zmm6[6] +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm29, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm29, %zmm11 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm21[0],zmm9[0],zmm21[2],zmm9[2],zmm21[4],zmm9[4],zmm21[6],zmm9[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm21[1],zmm9[1],zmm21[3],zmm9[3],zmm21[5],zmm9[5],zmm21[7],zmm9[7] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm25 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm14[0],zmm4[0],zmm14[2],zmm4[2],zmm14[4],zmm4[4],zmm14[6],zmm4[6] +; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm14[1],zmm4[1],zmm14[3],zmm4[3],zmm14[5],zmm4[5],zmm14[7],zmm4[7] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm29, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm29 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512F-NEXT: vinserti128 $1, (%rdx), %ymm3, %ymm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm28, %zmm13, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512F-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512F-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa 128(%rsi), %xmm0 +; AVX512F-NEXT: vinserti128 $1, 128(%rcx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX512F-NEXT: vinserti128 $1, 128(%rdx), %ymm12, %ymm13 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm26 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm5, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rsi), %xmm7 +; AVX512F-NEXT: vinserti128 $1, 192(%rcx), %ymm7, %ymm7 +; AVX512F-NEXT: vmovdqa 192(%rdi), %xmm10 +; AVX512F-NEXT: vinserti128 $1, 192(%rdx), %ymm10, %ymm10 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm2, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm10[1],ymm7[1],ymm10[3],ymm7[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm6, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512F-NEXT: vmovdqa64 (%rdx), %ymm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm6[0],ymm16[2],ymm6[2] +; AVX512F-NEXT: vmovdqa64 (%rsi), %ymm20 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm24, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm16[1],ymm6[1],ymm16[3],ymm6[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rcx), %ymm12 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %ymm16 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %ymm20 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa 128(%rcx), %ymm13 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm16[0],ymm13[0],ymm16[2],ymm13[2] +; AVX512F-NEXT: vmovdqa64 128(%rsi), %ymm20 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm22 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm20[0],ymm22[2],ymm20[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[2,3],ymm5[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm21 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm13[1],ymm16[3],ymm13[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm22[1],ymm20[1],ymm22[3],ymm20[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rcx), %ymm9 +; AVX512F-NEXT: vmovdqa 192(%rdx), %ymm11 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX512F-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm20 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm20[0],ymm16[0],ymm20[2],ymm16[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm20[1],ymm16[1],ymm20[3],ymm16[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovdqa64 %zmm9, 1728(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 1664(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, 1216(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 1152(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 1984(%rax) +; AVX512F-NEXT: vmovaps %zmm10, 1920(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 1856(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 1792(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 1536(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 1472(%rax) +; AVX512F-NEXT: vmovaps %zmm19, 1408(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm28, 1280(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 1088(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq ; -; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf32: -; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $2568, %rsp # imm = 0xA08 -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-FAST-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm22 -; AVX512F-ONLY-FAST-NEXT: movb $-64, %r11b -; AVX512F-ONLY-FAST-NEXT: kmovw %r11d, %k1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm26, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm25, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm10, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm24, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm11, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm17 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2] -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm26, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm31, %zmm10 -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1728(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1664(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1216(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 1152(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1984(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm11, 1920(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1792(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1408(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1344(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $2568, %rsp # imm = 0xA08 -; AVX512F-ONLY-FAST-NEXT: vzeroupper -; AVX512F-ONLY-FAST-NEXT: retq -; -; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf32: -; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $2568, %rsp # imm = 0xA08 -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm22 -; AVX512DQ-SLOW-NEXT: movb $-64, %r11b -; AVX512DQ-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm7 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm7 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm7 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm9 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm7 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512DQ-SLOW-NEXT: # ymm28 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm26, %zmm1 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm27 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm26, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm26, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm9 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm13 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm25, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm8, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm10, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm24, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %xmm11 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm11, %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm17 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %ymm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm19 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2] -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2] -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %ymm14 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2] -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm26, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm31, %zmm10 -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1728(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 1664(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 1216(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 1152(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 1984(%rax) -; AVX512DQ-SLOW-NEXT: vmovaps %zmm11, 1920(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 1792(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 1408(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 1344(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 1280(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-SLOW-NEXT: addq $2568, %rsp # imm = 0xA08 -; AVX512DQ-SLOW-NEXT: vzeroupper -; AVX512DQ-SLOW-NEXT: retq -; -; AVX512DQ-FAST-LABEL: store_i64_stride8_vf32: -; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $2568, %rsp # imm = 0xA08 -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FAST-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512DQ-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512DQ-FAST-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm22 -; AVX512DQ-FAST-NEXT: movb $-64, %r11b -; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm7 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm7 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm7 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm9 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm7 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512DQ-FAST-NEXT: # ymm28 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm1 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r10), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm29 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm27 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm26, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm9 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm13 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm0 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm25, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm7, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm10, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm24, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %xmm11 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm11, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %xmm17 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %ymm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm19 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2] -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2] -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %ymm14 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2] -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm26, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm31, %zmm10 -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1728(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 1664(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 1984(%rax) -; AVX512DQ-FAST-NEXT: vmovaps %zmm11, 1920(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 1792(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1408(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 1344(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-FAST-NEXT: addq $2568, %rsp # imm = 0xA08 -; AVX512DQ-FAST-NEXT: vzeroupper -; AVX512DQ-FAST-NEXT: retq -; -; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf32: -; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $2568, %rsp # imm = 0xA08 -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-ONLY-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %r11b -; AVX512BW-ONLY-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm26, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm26, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm26, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm25, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm7, %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm8, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm10, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm24, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %xmm11 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm11, %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm26, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm31, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1728(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1664(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1216(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1152(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1984(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm11, 1920(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1792(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1408(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1344(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1280(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $2568, %rsp # imm = 0xA08 -; AVX512BW-ONLY-SLOW-NEXT: vzeroupper -; AVX512BW-ONLY-SLOW-NEXT: retq -; -; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf32: -; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $2568, %rsp # imm = 0xA08 -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-ONLY-FAST-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: movb $-64, %r11b -; AVX512BW-ONLY-FAST-NEXT: kmovd %r11d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm26, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm25, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm7, %ymm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm10, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm24, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %xmm11 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm11, %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm26, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm31, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1728(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1664(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1216(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 1152(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1984(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm11, 1920(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1792(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1408(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1344(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $2568, %rsp # imm = 0xA08 -; AVX512BW-ONLY-FAST-NEXT: vzeroupper -; AVX512BW-ONLY-FAST-NEXT: retq -; -; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf32: -; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $2568, %rsp # imm = 0xA08 -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm22 -; AVX512DQBW-SLOW-NEXT: movb $-64, %r11b -; AVX512DQBW-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm7 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm9 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # ymm28 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm26, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm27 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm26, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm26, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm25, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm7, %ymm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm8, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm10, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm24, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %xmm11 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm11, %ymm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm17 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2] -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %ymm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %ymm14 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm26, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm31, %zmm10 -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1728(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 1664(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 1216(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 1152(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 1984(%rax) -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm11, 1920(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 1792(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 1408(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 1344(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 1280(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQBW-SLOW-NEXT: addq $2568, %rsp # imm = 0xA08 -; AVX512DQBW-SLOW-NEXT: vzeroupper -; AVX512DQBW-SLOW-NEXT: retq -; -; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf32: -; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $2568, %rsp # imm = 0xA08 -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-FAST-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm22 -; AVX512DQBW-FAST-NEXT: movb $-64, %r11b -; AVX512DQBW-FAST-NEXT: kmovd %r11d, %k1 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm7 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm9 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm7 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512DQBW-FAST-NEXT: # ymm28 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512DQBW-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm1 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r10), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm27 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm26, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm9 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm13 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm25, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm7, %ymm7 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm10, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm24, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %xmm11 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm11, %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %xmm17 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %ymm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %ymm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %ymm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm19 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2] -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2] -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %ymm10 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %ymm14 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2] -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm26, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm31, %zmm10 -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1728(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 1664(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 1984(%rax) -; AVX512DQBW-FAST-NEXT: vmovaps %zmm11, 1920(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 1792(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1408(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 1344(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQBW-FAST-NEXT: addq $2568, %rsp # imm = 0xA08 -; AVX512DQBW-FAST-NEXT: vzeroupper -; AVX512DQBW-FAST-NEXT: retq +; AVX512BW-LABEL: store_i64_stride8_vf32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-NEXT: vmovaps 128(%rdi), %zmm0 +; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512BW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm21 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm28 +; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm24 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm22 +; AVX512BW-NEXT: movb $-64, %r11b +; AVX512BW-NEXT: kmovd %r11d, %k1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm15, %zmm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] +; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm9 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] +; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm17[0],zmm24[0],zmm17[2],zmm24[2],zmm17[4],zmm24[4],zmm17[6],zmm24[6] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm9 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] +; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm15, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm19[1],zmm28[1],zmm19[3],zmm28[3],zmm19[5],zmm28[5],zmm19[7],zmm28[7] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm29, %zmm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm30 +; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm21 +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm14 +; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm31 +; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm29, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm29, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm29, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm6[0],zmm30[2],zmm6[2],zmm30[4],zmm6[4],zmm30[6],zmm6[6] +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm29, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm21[0],zmm9[0],zmm21[2],zmm9[2],zmm21[4],zmm9[4],zmm21[6],zmm9[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm21[1],zmm9[1],zmm21[3],zmm9[3],zmm21[5],zmm9[5],zmm21[7],zmm9[7] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm25 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm14[0],zmm4[0],zmm14[2],zmm4[2],zmm14[4],zmm4[4],zmm14[6],zmm4[6] +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm14[1],zmm4[1],zmm14[3],zmm4[3],zmm14[5],zmm4[5],zmm14[7],zmm4[7] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm29 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm3, %ymm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm28, %zmm13, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa 128(%rsi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, 128(%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdx), %ymm12, %ymm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm26 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm5, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rsi), %xmm7 +; AVX512BW-NEXT: vinserti128 $1, 192(%rcx), %ymm7, %ymm7 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm10 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdx), %ymm10, %ymm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm10[1],ymm7[1],ymm10[3],ymm7[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm6, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm6[0],ymm16[2],ymm6[2] +; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm20 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm24, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm16[1],ymm6[1],ymm16[3],ymm6[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm12 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %ymm16 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %ymm20 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm13 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm16[0],ymm13[0],ymm16[2],ymm13[2] +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %ymm20 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm22 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm20[0],ymm22[2],ymm20[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[2,3],ymm5[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm21 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm13[1],ymm16[3],ymm13[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm22[1],ymm20[1],ymm22[3],ymm20[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rcx), %ymm9 +; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm20 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm20[0],ymm16[0],ymm20[2],ymm16[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm20[1],ymm16[1],ymm20[3],ymm16[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm9, 1728(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 1664(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 1152(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 1984(%rax) +; AVX512BW-NEXT: vmovaps %zmm10, 1920(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 1856(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 1792(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 1536(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 1472(%rax) +; AVX512BW-NEXT: vmovaps %zmm19, 1408(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 1280(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1088(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 %in.vec1 = load <32 x i64>, ptr %in.vecptr1, align 64 %in.vec2 = load <32 x i64>, ptr %in.vecptr2, align 64 @@ -8896,74 +5968,74 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 464(%r10), %xmm15 -; SSE-NEXT: movaps 464(%rax), %xmm1 -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm1[1] -; SSE-NEXT: movaps 480(%rdi), %xmm11 -; SSE-NEXT: movaps 480(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] -; SSE-NEXT: movaps 480(%rdx), %xmm13 -; SSE-NEXT: movaps 480(%rcx), %xmm0 +; SSE-NEXT: movaps 464(%r10), %xmm2 +; SSE-NEXT: movaps 464(%rax), %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 480(%rdi), %xmm13 +; SSE-NEXT: movaps 480(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 480(%r8), %xmm10 +; SSE-NEXT: movaps 480(%rdx), %xmm10 +; SSE-NEXT: movaps 480(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm10, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 480(%r8), %xmm9 ; SSE-NEXT: movaps 480(%r9), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm14 +; SSE-NEXT: movaps %xmm9, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 480(%r10), %xmm9 -; SSE-NEXT: movaps 480(%rax), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 496(%rdi), %xmm6 -; SSE-NEXT: movaps 496(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm6, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-NEXT: movaps 496(%rdx), %xmm2 +; SSE-NEXT: movaps 480(%r10), %xmm11 +; SSE-NEXT: movaps 480(%rax), %xmm1 +; SSE-NEXT: movaps %xmm11, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] +; SSE-NEXT: movaps 496(%rdi), %xmm7 +; SSE-NEXT: movaps 496(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps 496(%rdx), %xmm5 ; SSE-NEXT: movaps 496(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps 496(%r8), %xmm1 -; SSE-NEXT: movaps 496(%r9), %xmm3 +; SSE-NEXT: movaps 496(%r9), %xmm2 ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movaps 496(%r10), %xmm3 -; SSE-NEXT: movaps 496(%rax), %xmm5 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] -; SSE-NEXT: movaps %xmm3, 4080(%rcx) +; SSE-NEXT: movaps 496(%r10), %xmm2 +; SSE-NEXT: movaps 496(%rax), %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, 4080(%rcx) ; SSE-NEXT: movaps %xmm1, 4064(%rcx) -; SSE-NEXT: movaps %xmm2, 4048(%rcx) -; SSE-NEXT: movaps %xmm6, 4032(%rcx) +; SSE-NEXT: movaps %xmm5, 4048(%rcx) +; SSE-NEXT: movaps %xmm7, 4032(%rcx) ; SSE-NEXT: movaps %xmm0, 4016(%rcx) ; SSE-NEXT: movaps %xmm4, 4000(%rcx) -; SSE-NEXT: movaps %xmm7, 3984(%rcx) +; SSE-NEXT: movaps %xmm6, 3984(%rcx) ; SSE-NEXT: movaps %xmm8, 3968(%rcx) -; SSE-NEXT: movaps %xmm9, 3952(%rcx) -; SSE-NEXT: movaps %xmm10, 3936(%rcx) -; SSE-NEXT: movaps %xmm13, 3920(%rcx) -; SSE-NEXT: movaps %xmm11, 3904(%rcx) +; SSE-NEXT: movaps %xmm11, 3952(%rcx) +; SSE-NEXT: movaps %xmm9, 3936(%rcx) +; SSE-NEXT: movaps %xmm10, 3920(%rcx) +; SSE-NEXT: movaps %xmm13, 3904(%rcx) ; SSE-NEXT: movaps %xmm12, 3888(%rcx) ; SSE-NEXT: movaps %xmm14, 3872(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 3856(%rcx) +; SSE-NEXT: movaps %xmm15, 3856(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 3840(%rcx) -; SSE-NEXT: movaps %xmm15, 3824(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 3824(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 3808(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -9478,58 +6550,58 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm8 ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm8[0],xmm7[0] ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r10), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r10), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 72(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -9914,10 +6986,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -9934,11 +7006,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -9955,12 +7026,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 88(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 88(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -10219,43 +7288,40 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 472(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 472(%r10), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 496(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps 496(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 496(%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 496(%rax), %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 504(%rdx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vbroadcastsd 504(%r10), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 496(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 496(%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 496(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm3[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 496(%rax), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 504(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vbroadcastsd 504(%r10), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] @@ -10294,17 +7360,17 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10313,12 +7379,12 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm3, 4064(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 4032(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 4032(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 4000(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 3968(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -10330,7 +7396,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 3840(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm12, 3808(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 3776(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 3776(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 3744(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -10371,8 +7437,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3104(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3072(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 3040(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 3008(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 3040(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 3008(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2976(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10399,8 +7465,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2592(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2560(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 2528(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 2496(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 2528(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 2496(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2464(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10563,7 +7629,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-ONLY-LABEL: store_i64_stride8_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3912, %rsp # imm = 0xF48 +; AVX2-ONLY-NEXT: subq $3880, %rsp # imm = 0xF28 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm0 @@ -10614,11 +7680,11 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 @@ -10626,11 +7692,11 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm1 @@ -10698,11 +7764,11 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 160(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 160(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 168(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%rax), %xmm1 @@ -10746,11 +7812,11 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 224(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 224(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 232(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%rax), %xmm1 @@ -10783,9 +7849,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -10842,11 +7908,11 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 352(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 352(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 360(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 352(%rax), %xmm1 @@ -10866,11 +7932,11 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 384(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 384(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 392(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 384(%rax), %xmm1 @@ -10885,15 +7951,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 424(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 416(%rcx), %xmm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rcx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 416(%r9), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%r8), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 416(%r8), %xmm13 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1] ; AVX2-ONLY-NEXT: vbroadcastsd 424(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 416(%rax), %xmm12 @@ -10905,13 +7971,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1] ; AVX2-ONLY-NEXT: vbroadcastsd 456(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 448(%rcx), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 448(%rcx), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%r9), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 448(%r8), %xmm8 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm9[1] +; AVX2-ONLY-NEXT: vmovaps 448(%r9), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 448(%r8), %xmm7 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1] ; AVX2-ONLY-NEXT: vbroadcastsd 456(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 448(%rax), %xmm6 @@ -10923,13 +7989,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1] ; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 480(%rcx), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rcx), %xmm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%r9), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 480(%r8), %xmm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps 480(%r9), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 480(%r8), %xmm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vbroadcastsd 488(%r10), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 480(%rax), %xmm0 @@ -11062,8 +8128,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd (%rsp), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 288(%rdx), %ymm14, %ymm14 ; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload @@ -11122,10 +8188,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 416(%rdx), %ymm14, %ymm14 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm13, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm13 = xmm13[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 416(%r10), %ymm13, %ymm13 @@ -11134,20 +8199,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdx), %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm7, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm9[0] +; AVX2-ONLY-NEXT: vbroadcastsd %xmm9, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%r10), %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm6, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdx), %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm3[0] +; AVX2-ONLY-NEXT: vbroadcastsd %xmm3, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%r10), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -11168,10 +8233,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -11188,10 +8253,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -11208,11 +8273,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%r10), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -11229,8 +8293,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%r10), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] @@ -11278,7 +8341,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 184(%r10), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -11296,7 +8359,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 216(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 216(%r10), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] @@ -11473,43 +8536,43 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastsd 472(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vbroadcastsd 472(%r10), %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vbroadcastsd 472(%r10), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 480(%r9), %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 496(%rax), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vbroadcastsd 504(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX2-ONLY-NEXT: vbroadcastsd 504(%r10), %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 480(%r9), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 496(%rax), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vbroadcastsd 504(%rdx), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-ONLY-NEXT: vbroadcastsd 504(%r10), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] @@ -11542,13 +8605,11 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11569,12 +8630,11 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm9, 4064(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 4032(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm9, 4000(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 4032(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 4000(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm9, 3968(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3808(%rdx) @@ -11607,14 +8667,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2720(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2688(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 2528(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 2496(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 2528(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 2496(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2464(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2432(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 2272(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 2240(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2240(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2208(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11629,13 +8690,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1920(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1760(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1728(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1696(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1664(%rdx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1504(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1472(%rdx) @@ -11811,657 +8872,643 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: addq $3912, %rsp # imm = 0xF48 +; AVX2-ONLY-NEXT: addq $3880, %rsp # imm = 0xF28 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $5576, %rsp # imm = 0x15C8 +; AVX512F-ONLY-SLOW-NEXT: subq $5512, %rsp # imm = 0x1588 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm16 ; AVX512F-ONLY-SLOW-NEXT: movb $-64, %r11b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512F-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm27, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm27, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm10, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] ; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm14, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -12469,261 +9516,261 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm1, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm12, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm13, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm18, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm19, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm13, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm19, %zmm23, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm23 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm31, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm23 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %xmm30 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm16, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm16, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm28 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm30 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm25, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm17, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm25 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm20 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm20, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm25 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 3776(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 3712(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 3264(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 3200(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2752(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 3776(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 3712(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 3200(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 2752(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2240(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 2176(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1664(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 2240(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1664(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 704(%rax) @@ -12751,8 +9798,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3392(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 3136(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3008(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12761,8 +9808,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2880(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2624(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2624(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12772,7 +9819,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2048(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12781,8 +9828,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1600(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1536(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1536(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12791,8 +9838,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12801,8 +9848,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 512(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12812,658 +9859,644 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $5576, %rsp # imm = 0x15C8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $5512, %rsp # imm = 0x1588 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $5576, %rsp # imm = 0x15C8 +; AVX512F-ONLY-FAST-NEXT: subq $5512, %rsp # imm = 0x1588 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-FAST-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 ; AVX512F-ONLY-FAST-NEXT: movb $-64, %r11b ; AVX512F-ONLY-FAST-NEXT: kmovw %r11d, %k1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm27, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm27, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] ; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm14, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -13471,261 +10504,261 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, (%rdx), %ymm1, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm13, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm18, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm19, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm13, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %xmm18 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm19, %zmm23, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %xmm18 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm23 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm31, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %xmm23 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %xmm30 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm16, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm16, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %xmm28 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %xmm30 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %xmm18 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm25, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm17, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm18 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm25 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm20, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm18 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm25 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 3776(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 3712(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 3264(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 3200(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 2752(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 3776(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 3712(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 3200(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 2752(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 2240(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 2176(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1664(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 2240(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1664(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1152(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 704(%rax) @@ -13753,8 +10786,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3392(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 3136(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3008(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13763,8 +10796,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2880(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2624(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2624(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2496(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13774,7 +10807,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2304(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2048(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 2048(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13783,8 +10816,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1856(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 1600(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1536(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1472(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13793,8 +10826,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13803,8 +10836,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13814,658 +10847,644 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $5576, %rsp # imm = 0x15C8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $5512, %rsp # imm = 0x1588 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $5576, %rsp # imm = 0x15C8 +; AVX512DQ-SLOW-NEXT: subq $5512, %rsp # imm = 0x1588 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm16 ; AVX512DQ-SLOW-NEXT: movb $-64, %r11b ; AVX512DQ-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] -; AVX512DQ-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512DQ-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512DQ-SLOW-NEXT: # ymm27 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm27, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm10 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm30 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm8 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r10), %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r10), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm5 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r10), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm29 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r10), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm18 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r10), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm24 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm27, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r10), %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r10), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] +; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r10), %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm16 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm10, %zmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r10), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r10), %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rax), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm18 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm19 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -14473,261 +11492,261 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm18 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %ymm4 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rsi), %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %ymm9 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rsi), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %ymm7 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rsi), %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm1, %ymm12 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm12 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm12, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm13, %ymm13 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm18, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm19, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %xmm13 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm13, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm18 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm19, %zmm23, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm18 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm23 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm31, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm23 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %xmm30 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm16, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rsi), %xmm12 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm16, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm28 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm30 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rsi), %xmm13 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm18 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm25, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm17, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm18 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm25 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm20 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm20, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm6, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm18 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm25 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 3776(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 3712(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 3264(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 3200(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 2752(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 3776(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 3712(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 3200(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 2752(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 2240(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 2176(%rax) -; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 1664(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 2240(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1664(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 704(%rax) @@ -14755,8 +11774,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3392(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 3136(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3008(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14765,8 +11784,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2880(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2624(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 2624(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14776,7 +11795,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 2048(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14785,8 +11804,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 1600(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 1536(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1536(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14795,8 +11814,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 1024(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14805,8 +11824,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 512(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14816,658 +11835,644 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-SLOW-NEXT: addq $5576, %rsp # imm = 0x15C8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-SLOW-NEXT: addq $5512, %rsp # imm = 0x1588 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i64_stride8_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $5576, %rsp # imm = 0x15C8 +; AVX512DQ-FAST-NEXT: subq $5512, %rsp # imm = 0x1588 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FAST-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 ; AVX512DQ-FAST-NEXT: movb $-64, %r11b ; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] -; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512DQ-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm11 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512DQ-FAST-NEXT: # ymm27 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm27, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm30 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm8 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r10), %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r10), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm5 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r10), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm29 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r10), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm18 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r10), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm24 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm27, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r10), %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r10), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm16 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r10), %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r10), %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rax), %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r10), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rax), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] ; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm18 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm6 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm14 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm19 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -15475,261 +12480,261 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %ymm18 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %ymm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rsi), %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %ymm9 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rsi), %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %ymm7 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rsi), %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, (%rdx), %ymm1, %ymm12 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm12 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm12, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm13, %ymm13 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm18, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm19, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %xmm13 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm13, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %xmm18 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm19, %zmm23, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %xmm18 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %xmm23 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm31, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %xmm23 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %xmm30 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm16, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rsi), %xmm12 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm16, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %xmm28 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %xmm30 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rsi), %xmm13 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %xmm18 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm25, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm17, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %xmm18 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %xmm25 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm20, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm6, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %xmm18 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %xmm25 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 3776(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 3712(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 3264(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 3200(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 2752(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 3776(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 3712(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 3200(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 2752(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 2240(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 2176(%rax) -; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 1664(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 2240(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1664(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1152(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 704(%rax) @@ -15757,8 +12762,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3392(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 3136(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3008(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15767,8 +12772,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2880(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 2624(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 2624(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2496(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15778,7 +12783,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2304(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 2048(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 2048(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15787,8 +12792,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1856(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 1600(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 1536(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 1536(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1472(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15797,8 +12802,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15807,8 +12812,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15818,658 +12823,644 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-FAST-NEXT: addq $5576, %rsp # imm = 0x15C8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-FAST-NEXT: addq $5512, %rsp # imm = 0x1588 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $5576, %rsp # imm = 0x15C8 +; AVX512BW-ONLY-SLOW-NEXT: subq $5512, %rsp # imm = 0x1588 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-ONLY-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm16 ; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %r11b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm27, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm27, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] ; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm14, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -16477,261 +13468,261 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm1, %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm12, %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm13, %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm18, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm19, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm13, %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm19, %zmm23, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm23 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm31, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm23 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %xmm30 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm16, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm16, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm28 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm30 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm18 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm25, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm17, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm18 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm25 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm20 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm20, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm6, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm18 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm25 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 3776(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 3712(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 3264(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 3200(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2752(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 3776(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 3712(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 3200(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 2752(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2240(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 2176(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1664(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 2240(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1664(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 704(%rax) @@ -16759,8 +13750,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3392(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 3136(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3008(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16769,8 +13760,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2880(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2624(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2624(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16780,7 +13771,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2048(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16789,8 +13780,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1600(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1536(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1536(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16799,8 +13790,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16809,8 +13800,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 512(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16820,658 +13811,644 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $5576, %rsp # imm = 0x15C8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $5512, %rsp # imm = 0x1588 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf64: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $5576, %rsp # imm = 0x15C8 +; AVX512BW-ONLY-FAST-NEXT: subq $5512, %rsp # imm = 0x1588 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-ONLY-FAST-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 ; AVX512BW-ONLY-FAST-NEXT: movb $-64, %r11b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r11d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512BW-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm27, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm27, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] ; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm14, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -17479,261 +14456,261 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %ymm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, (%rdx), %ymm1, %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm12 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm12, %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm13, %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm18, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm19, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm13, %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm19, %zmm23, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm23 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm31, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %xmm23 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %xmm30 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm16, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm16, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %xmm28 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %xmm30 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm25, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm17, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm25 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm20, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm6, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm25 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 3776(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 3712(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 3264(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 3200(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 2752(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 3776(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 3712(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 3200(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 2752(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 2240(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 2176(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1664(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 2240(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1664(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1152(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 704(%rax) @@ -17761,8 +14738,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3392(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 3136(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3008(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17771,8 +14748,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2880(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2624(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2624(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2496(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17782,7 +14759,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2304(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2048(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 2048(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17791,8 +14768,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1856(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 1600(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1536(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1472(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17801,8 +14778,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17811,8 +14788,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17822,658 +14799,644 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $5576, %rsp # imm = 0x15C8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $5512, %rsp # imm = 0x1588 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf64: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $5576, %rsp # imm = 0x15C8 +; AVX512DQBW-SLOW-NEXT: subq $5512, %rsp # imm = 0x1588 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm16 ; AVX512DQBW-SLOW-NEXT: movb $-64, %r11b ; AVX512DQBW-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512DQBW-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # ymm27 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm27, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm10 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm30 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r10), %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r10), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm5 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r10), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm29 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r10), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r10), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm24 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r10), %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r10), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm27, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r10), %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm16 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm10, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r10), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r10), %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rax), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -18481,261 +15444,261 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm9 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %ymm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm18 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %ymm4 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rsi), %ymm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %ymm9 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rsi), %ymm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %ymm7 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rsi), %ymm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm7 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm1, %ymm12 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm12 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm12, %ymm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm13, %ymm13 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm18, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm19, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %xmm13 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm13, %ymm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm18 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm19, %zmm23, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm18 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm23 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm31, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm23 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %xmm30 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm16, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rsi), %xmm12 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm16, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm28 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm30 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rsi), %xmm13 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm18 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm25, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm17, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm18 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm25 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm20, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm6, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm18 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm25 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 3776(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 3712(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 3264(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 3200(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 2752(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 3776(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 3712(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 3200(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 2752(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 2240(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 2176(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 1664(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 2240(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1664(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 704(%rax) @@ -18763,8 +15726,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3392(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 3136(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3008(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18773,8 +15736,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2880(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2624(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 2624(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18784,7 +15747,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 2048(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18793,8 +15756,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 1600(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 1536(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1536(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18803,8 +15766,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 1024(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18813,8 +15776,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 512(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18824,658 +15787,644 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQBW-SLOW-NEXT: addq $5576, %rsp # imm = 0x15C8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQBW-SLOW-NEXT: addq $5512, %rsp # imm = 0x1588 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf64: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $5576, %rsp # imm = 0x15C8 +; AVX512DQBW-FAST-NEXT: subq $5512, %rsp # imm = 0x1588 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-FAST-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 ; AVX512DQBW-FAST-NEXT: movb $-64, %r11b ; AVX512DQBW-FAST-NEXT: kmovd %r11d, %k1 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] -; AVX512DQBW-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm12 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512DQBW-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512DQBW-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm11 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512DQBW-FAST-NEXT: # ymm27 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm27, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm30 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm8 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r10), %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r10), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm5 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r10), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm29 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r10), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm18 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r10), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm24 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm27, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r10), %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r10), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm16 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm6 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r10), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r10), %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rax), %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r10), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rax), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] ; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm18 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm19 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -19483,261 +16432,261 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm9 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %ymm9 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %ymm18 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %ymm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rsi), %ymm12 -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %ymm9 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rsi), %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %ymm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rsi), %ymm12 -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm7 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, (%rdx), %ymm1, %ymm12 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm12 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm12, %ymm12 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm13, %ymm13 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm18, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm19, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %xmm13 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm13, %ymm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %xmm18 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm19, %zmm23, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %xmm18 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %xmm23 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm31, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %xmm23 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %xmm30 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm16, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rsi), %xmm12 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm16, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %xmm28 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %xmm30 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rsi), %xmm13 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %xmm18 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm25, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm17, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %xmm18 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %xmm25 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm20, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm6, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %xmm18 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %xmm25 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm5 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 3776(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 3712(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 3264(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 3200(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 2752(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 3776(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 3712(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 3200(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 2752(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 2240(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 2176(%rax) -; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 1664(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 2240(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1664(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1152(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 704(%rax) @@ -19765,8 +16714,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3392(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 3136(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3008(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19775,8 +16724,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2880(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 2624(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 2624(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2496(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19786,7 +16735,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2304(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 2048(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 2048(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19795,8 +16744,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1856(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 1600(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 1536(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 1536(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1472(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19805,8 +16754,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19815,8 +16764,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19826,8 +16775,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQBW-FAST-NEXT: addq $5576, %rsp # imm = 0x15C8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQBW-FAST-NEXT: addq $5512, %rsp # imm = 0x1588 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll index 729f2eb4b7997..06e94ccb92e69 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll @@ -595,7 +595,7 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa 32(%rdi), %xmm12 ; SSE-NEXT: movdqa 48(%rdi), %xmm5 ; SSE-NEXT: movdqa 16(%rsi), %xmm9 -; SSE-NEXT: movdqa 32(%rsi), %xmm15 +; SSE-NEXT: movdqa 32(%rsi), %xmm14 ; SSE-NEXT: movdqa 48(%rsi), %xmm4 ; SSE-NEXT: movdqa 16(%rdx), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -626,7 +626,7 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,5] ; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] @@ -674,29 +674,29 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa (%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: pandn %xmm14, %xmm10 +; SSE-NEXT: pandn %xmm15, %xmm10 ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm3, %xmm10 ; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,7,7,7,7] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: por %xmm15, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] -; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] +; SSE-NEXT: movdqa %xmm15, %xmm8 ; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm0, %xmm8 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,1,2] @@ -715,8 +715,8 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm3, %xmm6 ; SSE-NEXT: pand %xmm1, %xmm4 ; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] ; SSE-NEXT: movdqa %xmm1, %xmm4 @@ -730,12 +730,12 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,5,7,7] -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm4 ; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,1,2] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,1,1,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 @@ -763,13 +763,13 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7] -; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: movdqa %xmm15, %xmm12 ; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: por %xmm3, %xmm12 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,1,2] @@ -781,7 +781,7 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6] ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,1,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,5,5,6,6] ; SSE-NEXT: movdqa %xmm1, %xmm7 @@ -800,13 +800,13 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm9, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[2,3,2,3] +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] -; SSE-NEXT: pandn %xmm9, %xmm14 -; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: pandn %xmm9, %xmm15 +; SSE-NEXT: por %xmm3, %xmm15 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,1,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] @@ -817,13 +817,13 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,1,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6] ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm14, 32(%rcx) +; SSE-NEXT: movdqa %xmm15, 32(%rcx) ; SSE-NEXT: movdqa %xmm7, 48(%rcx) ; SSE-NEXT: movdqa %xmm12, 80(%rcx) ; SSE-NEXT: movdqa %xmm5, 96(%rcx) @@ -840,119 +840,119 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX1-ONLY-LABEL: store_i8_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: pushq %rax +; AVX1-ONLY-NEXT: subq $24, %rsp ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm13 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm14 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm15 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm15, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm13[8],xmm11[9],xmm13[9],xmm11[10],xmm13[10],xmm11[11],xmm13[11],xmm11[12],xmm13[12],xmm11[13],xmm13[13],xmm11[14],xmm13[14],xmm11[15],xmm13[15] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm12[8],xmm0[9],xmm12[9],xmm0[10],xmm12[10],xmm0[11],xmm12[11],xmm0[12],xmm12[12],xmm0[13],xmm12[13],xmm0[14],xmm12[14],xmm0[15],xmm12[15] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4] -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm8, %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm8, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 64(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm15, 80(%rcx) +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 80(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm14, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 32(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm12, 48(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 160(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 176(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 160(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 176(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm13, 96(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 112(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 128(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 144(%rcx) -; AVX1-ONLY-NEXT: popq %rax +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 112(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 128(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 144(%rcx) +; AVX1-ONLY-NEXT: addq $24, %rsp ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i8_stride3_vf64: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll index 65028fbbb13f1..ea7a3a0920c24 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll @@ -517,70 +517,70 @@ define void @store_i8_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm12 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm2[8],xmm9[9],xmm2[9],xmm9[10],xmm2[10],xmm9[11],xmm2[11],xmm9[12],xmm2[12],xmm9[13],xmm2[13],xmm9[14],xmm2[14],xmm9[15],xmm2[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm6[8],xmm13[9],xmm6[9],xmm13[10],xmm6[10],xmm13[11],xmm6[11],xmm13[12],xmm6[12],xmm13[13],xmm6[13],xmm13[14],xmm6[14],xmm13[15],xmm6[15] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3],xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm14[8],xmm7[8],xmm14[9],xmm7[9],xmm14[10],xmm7[10],xmm14[11],xmm7[11],xmm14[12],xmm7[12],xmm14[13],xmm7[13],xmm14[14],xmm7[14],xmm14[15],xmm7[15] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm10[8],xmm12[8],xmm10[9],xmm12[9],xmm10[10],xmm12[10],xmm10[11],xmm12[11],xmm10[12],xmm12[12],xmm10[13],xmm12[13],xmm10[14],xmm12[14],xmm10[15],xmm12[15] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r8) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index 19dd55874c471..04b9dcb418cea 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -561,39 +561,38 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i8_stride5_vf16: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa (%rsi), %xmm11 +; SSE-NEXT: movdqa (%rsi), %xmm8 ; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa (%rcx), %xmm8 +; SSE-NEXT: movdqa (%rcx), %xmm4 ; SSE-NEXT: movdqa (%r8), %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm6, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 ; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,2,2] ; SSE-NEXT: movdqa %xmm10, %xmm12 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm11[0,1,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,1,0] -; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pandn %xmm10, %xmm11 ; SSE-NEXT: por %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: pandn %xmm11, %xmm10 ; SSE-NEXT: por %xmm5, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] @@ -607,8 +606,8 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm12[2,2,3,3] ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; SSE-NEXT: pand %xmm12, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] @@ -622,14 +621,14 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm9[0,1,2,3,5,6,6,7] ; SSE-NEXT: movdqa %xmm9, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,2,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: movdqa %xmm8, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15] +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm4[8],xmm14[9],xmm4[9],xmm14[10],xmm4[10],xmm14[11],xmm4[11],xmm14[12],xmm4[12],xmm14[13],xmm4[13],xmm14[14],xmm4[14],xmm14[15],xmm4[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm15, %xmm0 ; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm11, %xmm0 @@ -638,8 +637,8 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm15, %xmm11 ; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; SSE-NEXT: movdqa %xmm7, %xmm15 @@ -650,13 +649,13 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] ; SSE-NEXT: pand %xmm3, %xmm0 @@ -670,13 +669,13 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] ; SSE-NEXT: pandn %xmm3, %xmm12 ; SSE-NEXT: por %xmm0, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] ; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,3] ; SSE-NEXT: pandn %xmm3, %xmm6 ; SSE-NEXT: por %xmm0, %xmm6 @@ -684,17 +683,17 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm12, %xmm13 ; SSE-NEXT: por %xmm6, %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm2, %xmm13 +; SSE-NEXT: pand %xmm1, %xmm13 ; SSE-NEXT: por %xmm13, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2] -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] ; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -702,14 +701,14 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,2] ; SSE-NEXT: pandn %xmm3, %xmm7 ; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm7, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, 64(%r9) ; SSE-NEXT: movdqa %xmm6, (%r9) ; SSE-NEXT: movdqa %xmm15, 16(%r9) @@ -1085,22 +1084,21 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp ; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa (%rsi), %xmm10 -; SSE-NEXT: movdqa 16(%rsi), %xmm12 +; SSE-NEXT: movdqa (%rsi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rsi), %xmm7 ; SSE-NEXT: movdqa (%rdx), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm1 -; SSE-NEXT: movdqa (%rcx), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rcx), %xmm7 -; SSE-NEXT: movdqa 16(%r8), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: movdqa 16(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rcx), %xmm11 +; SSE-NEXT: movdqa 16(%rcx), %xmm12 +; SSE-NEXT: movdqa 16(%r8), %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] @@ -1110,10 +1108,10 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] ; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,0,3,4,5,6,7] @@ -1124,32 +1122,34 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,2,2] +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,1,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,1,2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,2] +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm8, %xmm0 @@ -1159,13 +1159,14 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa (%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm12[8],xmm0[9],xmm12[9],xmm0[10],xmm12[10],xmm0[11],xmm12[11],xmm0[12],xmm12[12],xmm0[13],xmm12[13],xmm0[14],xmm12[14],xmm0[15],xmm12[15] +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] @@ -1173,45 +1174,45 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] ; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] ; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm7[8],xmm14[9],xmm7[9],xmm14[10],xmm7[10],xmm14[11],xmm7[11],xmm14[12],xmm7[12],xmm14[13],xmm7[13],xmm14[14],xmm7[14],xmm14[15],xmm7[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,1,2,1] +; SSE-NEXT: movdqa %xmm12, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,4] ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 ; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,1,2,3] +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] ; SSE-NEXT: movdqa %xmm1, %xmm5 @@ -1219,32 +1220,33 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,1,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm7, %xmm4 ; SSE-NEXT: pand %xmm1, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,0,1,1] ; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,4,7] ; SSE-NEXT: movdqa %xmm9, %xmm5 ; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,2,3,3] ; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm4, %xmm5 ; SSE-NEXT: movdqa %xmm8, %xmm4 @@ -1254,58 +1256,57 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, %xmm12 ; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm3, %xmm15 -; SSE-NEXT: por %xmm7, %xmm15 -; SSE-NEXT: pand %xmm8, %xmm15 -; SSE-NEXT: por %xmm4, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,4] +; SSE-NEXT: pand %xmm3, %xmm14 +; SSE-NEXT: por %xmm7, %xmm14 +; SSE-NEXT: pand %xmm8, %xmm14 +; SSE-NEXT: por %xmm4, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa %xmm13, %xmm7 ; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pand %xmm13, %xmm15 -; SSE-NEXT: por %xmm15, %xmm7 +; SSE-NEXT: pand %xmm13, %xmm14 +; SSE-NEXT: por %xmm14, %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm4, %xmm14 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm4, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm4, %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,1,1] ; SSE-NEXT: movdqa %xmm9, %xmm7 ; SSE-NEXT: pandn %xmm2, %xmm7 ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,5,7,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,5,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,2] ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,6,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 @@ -1315,27 +1316,27 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,3,3,3] -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: pand %xmm6, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm2, %xmm14 +; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pand %xmm10, %xmm15 -; SSE-NEXT: por %xmm15, %xmm14 +; SSE-NEXT: movdqa %xmm11, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: pand %xmm11, %xmm14 +; SSE-NEXT: por %xmm14, %xmm15 ; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 @@ -1344,18 +1345,18 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshuflw $164, (%rsp), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,3] -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,0,0] +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,0,0] ; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: pand %xmm8, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: por %xmm2, %xmm14 +; SSE-NEXT: pand %xmm8, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm15 -; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm14 +; SSE-NEXT: por %xmm14, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,5,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,2] ; SSE-NEXT: movdqa %xmm3, %xmm5 @@ -1365,24 +1366,24 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[3,3,3,3] -; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[3,3,3,3] +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[0,1,2,3,7,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,3,2,2] -; SSE-NEXT: pandn %xmm12, %xmm11 -; SSE-NEXT: por %xmm2, %xmm11 +; SSE-NEXT: pandn %xmm12, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm11 +; SSE-NEXT: pand %xmm6, %xmm10 ; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: por %xmm11, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: por %xmm5, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: por %xmm5, %xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE-NEXT: pand %xmm9, %xmm2 @@ -1391,7 +1392,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] ; SSE-NEXT: pandn %xmm5, %xmm9 ; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,0,0,0] ; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,1,2,2,4,5,6,7] @@ -1406,9 +1407,9 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm8, %xmm3 ; SSE-NEXT: movdqa %xmm3, (%r9) -; SSE-NEXT: movdqa %xmm10, 64(%r9) +; SSE-NEXT: movdqa %xmm11, 64(%r9) ; SSE-NEXT: movdqa %xmm0, 80(%r9) -; SSE-NEXT: movdqa %xmm14, 144(%r9) +; SSE-NEXT: movdqa %xmm15, 144(%r9) ; SSE-NEXT: movdqa %xmm7, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r9) @@ -1425,151 +1426,148 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX1-ONLY-LABEL: store_i8_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,1,2,3,4,128,6,7,8,9,128,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm3[1,2,3,4],zero,xmm3[6,7,8,9],zero,xmm3[11,12,13,14],zero +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5],zero,xmm2[7,8,9,10],zero,xmm2[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm1[9],zero,zero,zero,zero,xmm1[10],zero,zero,zero,zero,xmm1[11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm10 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm15[9],zero,zero,zero,zero,xmm15[10],zero,zero,zero,zero,xmm15[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm12, %ymm10 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm10 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3],zero,xmm10[5,6,7,8],zero,xmm10[10,11,12,13],zero,xmm10[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,xmm1[2],zero -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm14 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm13, %ymm14 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm14, %ymm5 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm14 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3],zero,xmm14[5,6,7,8],zero,xmm14[10,11,12,13],zero,xmm14[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm15[0],zero,zero,zero,zero,xmm15[1],zero,zero,zero,zero,xmm15[2],zero +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm5[1,2,3,4],zero,xmm5[6,7,8,9],zero,xmm5[11,12,13,14],zero ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm14, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm9[6,u,u,u],zero,xmm9[7,u,u,u],zero,xmm9[8,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6],zero,xmm5[u,u,u,7],zero,xmm5[u,u,u,8],zero,xmm5[u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[6,u,u,u],zero,xmm0[7,u,u,u],zero,xmm0[8,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,u],zero,xmm10[7,u,u,u],zero,xmm10[8,u,u,u],zero,xmm10[9,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] ; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm13, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm9, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm9, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm5 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm12, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm14, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,xmm1[5,6,7,8],zero,xmm1[10,11,12,13],zero,xmm1[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm14[0],zero,zero,zero,zero,xmm14[1],zero,zero,zero,zero,xmm14[2],zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm12, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[0,1,2,3],zero,xmm9[5,6,7,8],zero,xmm9[10,11,12,13],zero,xmm9[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm14[0],zero,zero,zero,zero,xmm14[1],zero,zero,zero,zero,xmm14[2],zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,xmm2[7,u,u,u],zero,xmm2[8,u,u,u],zero,xmm2[9,u] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm15[6,u,u,u],zero,xmm15[7,u,u,u],zero,xmm15[8,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm7[6,u,u,u],zero,xmm7[7,u,u,u],zero,xmm7[8,u,u,u],zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[6],zero,xmm8[u,u,u,7],zero,xmm8[u,u,u,8],zero,xmm8[u,u,u,9] ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5],zero,xmm2[7,8,9,10],zero,xmm2[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm14[9],zero,zero,zero,zero,xmm14[10],zero,zero,zero,zero,xmm14[11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm14[9],zero,zero,zero,zero,xmm14[10],zero,zero,zero,zero,xmm14[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 32(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 112(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 96(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 16(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm15, 112(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 96(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2227,30 +2225,31 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i8_stride5_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $504, %rsp # imm = 0x1F8 -; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: movdqa (%rdi), %xmm7 ; SSE-NEXT: movdqa (%rsi), %xmm9 ; SSE-NEXT: movdqa 16(%rsi), %xmm14 -; SSE-NEXT: movdqa (%rdx), %xmm6 -; SSE-NEXT: movdqa 16(%rdx), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdx), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rcx), %xmm10 -; SSE-NEXT: movdqa 16(%rcx), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r8), %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa 16(%rcx), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r8), %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,1] @@ -2261,24 +2260,24 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,2,2] +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,1,2,3] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,2,1] @@ -2293,16 +2292,16 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm5 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa 16(%r8), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rcx), %xmm0 @@ -2311,16 +2310,16 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 32(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa 32(%rsi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,2,1] +; SSE-NEXT: movdqa 32(%rsi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7] @@ -2332,16 +2331,16 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa 32(%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rcx), %xmm0 @@ -2350,13 +2349,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa 48(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2372,36 +2371,36 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa 48(%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm9, %xmm1 @@ -2412,11 +2411,11 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,0,1,1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,0,1,1] ; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm1 @@ -2429,13 +2428,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,7] ; SSE-NEXT: movdqa %xmm4, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: pandn %xmm5, %xmm9 @@ -2446,219 +2445,215 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,4] ; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm8, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,2,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm5 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,1,1] +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] ; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pand %xmm4, %xmm8 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,4,7] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] ; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] ; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,4] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1] ; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pand %xmm4, %xmm8 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,1,2,1] +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,4,7] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[2,2,3,3] ; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] ; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,2,2] +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,4] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,1,1] ; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: pandn %xmm5, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6] ; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm7, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,0,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,4,7] ; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[2,2,3,3] ; SSE-NEXT: pand %xmm4, %xmm5 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pandn %xmm6, %xmm5 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,4] ; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm6 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,2,2,2] ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: pand %xmm12, %xmm6 @@ -2669,84 +2664,84 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] ; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[1,0,2,3,4,5,6,7] +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] ; SSE-NEXT: pand %xmm4, %xmm5 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pandn %xmm6, %xmm5 ; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,1,3] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,0,0] -; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,0,0] +; SSE-NEXT: pand %xmm12, %xmm6 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,3,2] ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,7,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] ; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pandn %xmm6, %xmm5 ; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[3,3,3,3] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[3,3,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; SSE-NEXT: movdqa %xmm9, %xmm13 ; SSE-NEXT: pandn %xmm5, %xmm13 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: por %xmm8, %xmm13 +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: por %xmm7, %xmm13 ; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] ; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[1,0,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] ; SSE-NEXT: pand %xmm4, %xmm5 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pandn %xmm6, %xmm5 ; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,1,3] ; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,0,0] ; SSE-NEXT: pand %xmm12, %xmm6 ; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm3 ; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,0,0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: pandn %xmm5, %xmm10 ; SSE-NEXT: pand %xmm2, %xmm3 @@ -2756,53 +2751,52 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2] ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,7,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] ; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pandn %xmm5, %xmm3 ; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,1,2,3,7,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[3,3,3,3] +; SSE-NEXT: pand %xmm15, %xmm5 ; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: pand %xmm7, %xmm6 ; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[3,3,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 ; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: por %xmm6, %xmm7 ; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] ; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,0,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm6, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm14[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,1,3] ; SSE-NEXT: movdqa %xmm12, %xmm11 ; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,0,0] ; SSE-NEXT: pand %xmm12, %xmm6 ; SSE-NEXT: por %xmm6, %xmm11 -; SSE-NEXT: pand %xmm15, %xmm11 +; SSE-NEXT: pand %xmm8, %xmm11 ; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm14 ; SSE-NEXT: pandn %xmm3, %xmm14 ; SSE-NEXT: pand %xmm2, %xmm11 @@ -2812,24 +2806,24 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2] ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,7,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm6, %xmm3 ; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,2] -; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm15, %xmm11 ; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[3,3,3,3] -; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] +; SSE-NEXT: pand %xmm15, %xmm6 ; SSE-NEXT: por %xmm6, %xmm11 -; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: pand %xmm5, %xmm11 ; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 ; SSE-NEXT: pand %xmm9, %xmm11 @@ -2851,15 +2845,15 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,1,3] ; SSE-NEXT: pandn %xmm11, %xmm12 ; SSE-NEXT: por %xmm3, %xmm12 -; SSE-NEXT: pand %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm4, %xmm15 -; SSE-NEXT: por %xmm12, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm12 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: por %xmm12, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm15 -; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: pand %xmm2, %xmm1 @@ -2869,27 +2863,26 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,2] -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] ; SSE-NEXT: pandn %xmm1, %xmm9 ; SSE-NEXT: por %xmm2, %xmm9 ; SSE-NEXT: movdqa %xmm9, 304(%r9) ; SSE-NEXT: movdqa %xmm0, 240(%r9) ; SSE-NEXT: movdqa %xmm6, 224(%r9) ; SSE-NEXT: movdqa %xmm14, 160(%r9) -; SSE-NEXT: movdqa %xmm8, 144(%r9) +; SSE-NEXT: movdqa %xmm7, 144(%r9) ; SSE-NEXT: movdqa %xmm10, 80(%r9) ; SSE-NEXT: movdqa %xmm13, 64(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2925,276 +2918,272 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $104, %rsp ; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm0[6,u,u,u],zero,xmm0[7,u,u,u],zero,xmm0[8,u,u,u],zero -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [6,128,8,128,0,7,128,9,6,128,8,128,0,7,128,9] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1],zero,xmm3[3,4,5,6],zero,xmm3[8,9,10,11],zero,xmm3[13,14,15] -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm9 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm9[6],zero,zero,zero,zero,xmm9[7],zero,zero,zero,zero,xmm9[8],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7],zero,xmm1[9,10,11,12],zero,xmm1[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm9[3],zero,zero,zero,zero,xmm9[4],zero,zero,zero,zero,xmm9[5],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,10,11,u,u,u,12,13,u,u,u,14,15,u,u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [7,0,4,5,8,9,0,6,7,0,4,5,8,9,0,6] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,7,6,0,5,4,9,8,2,7,6,0,5,4,9,8] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1],zero,xmm6[3,4,5,6],zero,xmm6[8,9,10,11],zero,xmm6[13,14,15] +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm7[6],zero,zero,zero,zero,xmm7[7],zero,zero,zero,zero,xmm7[8],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[4,5,6,7],zero,xmm4[9,10,11,12],zero,xmm4[14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm7[3],zero,zero,zero,zero,xmm7[4],zero,zero,zero,zero,xmm7[5],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,10,11,u,u,u,12,13,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [7,0,4,5,8,9,0,6,7,0,4,5,8,9,0,6] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [2,7,6,0,5,4,9,8,2,7,6,0,5,4,9,8] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[12],zero,zero,zero,zero,xmm9[13],zero,zero,zero,zero,xmm9[14],zero,zero,zero,zero,xmm9[15] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[12],zero,zero,zero,zero,xmm7[13],zero,zero,zero,zero,xmm7[14],zero,zero,zero,zero,xmm7[15] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [0,128,2,3,4,5,128,7,8,9,10,128,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,9,128,128,128,128,10,128,128,128,128,11,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,128,2,3,4,5,128,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [128,9,128,128,128,128,10,128,128,128,128,11,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm4[6,u,u,u],zero,xmm4[7,u,u,u],zero,xmm4[8,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[6],zero,xmm8[u,u,u,7],zero,xmm8[u,u,u,8],zero,xmm8[u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm14, %xmm10 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm10, %ymm14 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm10, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm14, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1],zero,xmm8[3,4,5,6],zero,xmm8[8,9,10,11],zero,xmm8[13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm3[6],zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,xmm3[8],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [3,0,0,1,4,5,0,2,3,0,0,1,4,5,0,2] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm13, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm14 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,1,4,5,0,2,3,6,0,1,4,5,0,2,3,6] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm13 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,128,5,6,7,8,128,10,11,12,13,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,128,0,128,128,128,128,1,128,128,128,128,2,128] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [128,1,2,3,4,128,6,7,8,9,128,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm10, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[0,1],zero,xmm14[3,4,5,6],zero,xmm14[8,9,10,11],zero,xmm14[13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm6[6],zero,zero,zero,zero,xmm6[7],zero,zero,zero,zero,xmm6[8],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [3,0,0,1,4,5,0,2,3,0,0,1,4,5,0,2] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,1,4,5,0,2,3,6,0,1,4,5,0,2,3,6] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,128,5,6,7,8,128,10,11,12,13,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,128,0,128,128,128,128,1,128,128,128,128,2,128] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,1,2,3,4,128,6,7,8,9,128,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7],zero,xmm1[9,10,11,12],zero,xmm1[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm3[3],zero,zero,zero,zero,xmm3[4],zero,zero,zero,zero,xmm3[5],zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3],zero,zero,zero,zero,xmm6[4],zero,zero,zero,zero,xmm6[5],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm15 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm15 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0],zero,xmm5[2,3,4,5],zero,xmm5[7,8,9,10],zero,xmm5[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0],zero,xmm3[2,3,4,5],zero,xmm3[7,8,9,10],zero,xmm3[12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm11 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm12 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,10,11,u,u,u,12,13,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm10, %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm14, %ymm11 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vandps %ymm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm13 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3],zero,xmm13[5,6,7,8],zero,xmm13[10,11,12,13],zero,xmm13[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[12],zero,zero,zero,zero,xmm11[13],zero,zero,zero,zero,xmm11[14],zero,zero,zero,zero,xmm11[15] -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm13 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[12],zero,zero,zero,zero,xmm13[13],zero,zero,zero,zero,xmm13[14],zero,zero,zero,zero,xmm13[15] +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6],zero,xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,xmm8[7,u,u,u],zero,xmm8[8,u,u,u],zero,xmm8[9,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,7],zero,xmm4[u,u,u,8],zero,xmm4[u,u,u,9],zero,xmm4[u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,7],zero,xmm6[u,u,u,8],zero,xmm6[u,u,u,9],zero,xmm6[u] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3],zero,xmm2[5,6,7,8],zero,xmm2[10,11,12,13],zero,xmm2[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm11[0],zero,zero,zero,zero,xmm11[1],zero,zero,zero,zero,xmm11[2],zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u],zero,xmm5[7,u,u,u],zero,xmm5[8,u,u,u],zero,xmm5[9,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,7],zero,xmm6[u,u,u,8],zero,xmm6[u,u,u,9],zero,xmm6[u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[5,6,7,8],zero,xmm4[10,11,12,13],zero,xmm4[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm13[0],zero,zero,zero,zero,xmm13[1],zero,zero,zero,zero,xmm13[2],zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,xmm5[7,u,u,u],zero,xmm5[8,u,u,u],zero,xmm5[9,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,7],zero,xmm3[u,u,u,8],zero,xmm3[u,u,u,9],zero,xmm3[u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm9[6,u,u,u],zero,xmm9[7,u,u,u],zero,xmm9[8,u,u,u],zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[6],zero,xmm7[u,u,u,7],zero,xmm7[u,u,u,8],zero,xmm7[u,u,u,9] ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 @@ -3202,22 +3191,22 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0],zero,xmm5[2,3,4,5],zero,xmm5[7,8,9,10],zero,xmm5[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm11[9],zero,zero,zero,zero,xmm11[10],zero,zero,zero,zero,xmm11[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm13[9],zero,zero,zero,zero,xmm13[10],zero,zero,zero,zero,xmm13[11],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%r9) +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 32(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 96(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm13, 112(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 112(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3252,18 +3241,18 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-SLOW-LABEL: store_i8_stride5_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $184, %rsp -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-SLOW-NEXT: subq $312, %rsp # imm = 0x138 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 @@ -3272,208 +3261,217 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm11, %xmm7 -; AVX2-SLOW-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm8 +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm8, %xmm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm10, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm10, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm7 ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm14, %ymm13 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] -; AVX2-SLOW-NEXT: vpor %ymm4, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm13, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm4[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm12 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] +; AVX2-SLOW-NEXT: vpor %ymm9, %ymm12, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm9 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm15 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm15, %ymm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,2,1,1,4,6,5,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,2,1,1,4,6,5,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm10 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm10, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm8 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX2-SLOW-NEXT: vpor %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[0,2,1,1,4,6,5,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm9, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[0,2,1,1,4,6,5,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <3,3,3,u,4,4,4,4> -; AVX2-SLOW-NEXT: vpermd %ymm15, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <3,3,3,u,4,4,4,4> +; AVX2-SLOW-NEXT: vpermd %ymm11, %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm5, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm15, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm14, %ymm15 -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm15, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm5 -; AVX2-SLOW-NEXT: vpor %ymm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm11 +; AVX2-SLOW-NEXT: vpor %ymm4, %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <3,3,3,3,u,4,4,4> -; AVX2-SLOW-NEXT: vpermd %ymm8, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm15 -; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermd %ymm12, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermd %ymm9, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm5, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufd $80, (%rsp), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm12, %ymm6 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm14[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0,255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0] -; AVX2-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255] -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm4 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm13[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm4, %ymm9, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufd $80, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm1, %ymm6 +; AVX2-SLOW-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm7 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0,255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255] +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm2 +; AVX2-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, 224(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 128(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%r9) +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 224(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 256(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 160(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 288(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 288(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, (%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: addq $184, %rsp +; AVX2-SLOW-NEXT: addq $312, %rsp # imm = 0x138 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride5_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $168, %rsp -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm11 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 @@ -3514,7 +3512,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3523,125 +3521,126 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu %ymm13, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm2 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,2,3,3] ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] -; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] +; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm7 +; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm8 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,2,3,3] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,2,3,3] ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm10, %ymm14, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [6,6,6,6,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm10, %ymm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm14, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm9, %ymm15, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,6,6,6,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm8, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm7, %ymm14, %ymm7 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm15 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm15 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm13, %ymm15, %ymm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm14 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm15, %ymm14, %ymm14 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm13, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm8 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm8, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm14, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm6 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm6, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm8, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,6,5,5,5,5,4,6] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm8, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm7, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm6, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <3,3,3,u,4,4,4,4> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm4, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm6, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [4,6,5,5,5,5,4,6] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm6, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <3,3,3,u,4,4,4,4> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] ; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm8 -; AVX2-FAST-NEXT: vpor %ymm3, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm6 +; AVX2-FAST-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm3 ; AVX2-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <3,3,3,3,u,4,4,4> -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <3,3,3,3,u,4,4,4> +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] @@ -3654,23 +3653,23 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm4, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 224(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 224(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm7, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 256(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 256(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm3, 160(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-FAST-NEXT: addq $168, %rsp @@ -3680,9 +3679,9 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-LABEL: store_i8_stride5_vf64: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> @@ -3708,7 +3707,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] @@ -3724,7 +3723,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 @@ -3735,162 +3734,162 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm7, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm6, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm6, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm7, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm8, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,3,3,6,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm9, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm5, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm14, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[2,2,3,3,6,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[2,2,3,3,6,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm4, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm15, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm5, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero +; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm15, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm12, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm10, %ymm14, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm9, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm15, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm10, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm0[21],zero,zero,ymm0[20],zero,ymm0[22],zero,ymm0[24],zero,zero,ymm0[23],zero,ymm0[25],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm7, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm13, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm14, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm9, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[0,2,1,1,4,6,5,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm10, %ymm9, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,1,1,4,6,5,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <3,3,3,u,4,4,4,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm10, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm7, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm10, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm8, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,2,1,1,4,6,5,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm8, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,1,1,4,6,5,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm6, %ymm8, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <3,3,3,u,4,4,4,4> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm5, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm11, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm8, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm13, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <3,3,3,3,u,4,4,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <3,3,3,3,u,4,4,4> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd $80, (%rsp), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 96(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 256(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: addq $200, %rsp @@ -3899,368 +3898,372 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512F-SLOW-LABEL: store_i8_stride5_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm5, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm4, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm4, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm1 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm25 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm17 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm17 ; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm9 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm14, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm9, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm10 -; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm10, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm11, %ymm10 +; AVX512F-SLOW-NEXT: vpor %ymm3, %ymm10, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm25 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm27 -; AVX512F-SLOW-NEXT: vporq %xmm12, %xmm13, %xmm20 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm22 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,ymm4[27],zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30],zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm23 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,zero,ymm7[26],zero,ymm7[28],zero,ymm7[30],zero,zero,ymm7[29],zero,ymm7[31],zero,zero -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm31 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512F-SLOW-NEXT: vporq %ymm15, %ymm1, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm15, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm9 -; AVX512F-SLOW-NEXT: vporq %ymm1, %ymm9, %ymm24 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm26 +; AVX512F-SLOW-NEXT: vporq %xmm3, %xmm12, %xmm21 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm22 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm23 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] +; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm11, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm30 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm31 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm7, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm12, %ymm1 +; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm5, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm1 +; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm20 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm13 -; AVX512F-SLOW-NEXT: vporq %xmm1, %xmm13, %xmm25 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm6 +; AVX512F-SLOW-NEXT: vporq %xmm1, %xmm6, %xmm28 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm5 -; AVX512F-SLOW-NEXT: vporq %xmm13, %xmm5, %xmm28 -; AVX512F-SLOW-NEXT: vmovdqa64 32(%r8), %xmm29 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm26 = xmm29[1,1,2,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX512F-SLOW-NEXT: vpandnq %ymm26, %ymm27, %ymm26 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm5, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm26, %zmm26 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm27 = ymm4[0,2,1,1,4,6,5,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,3,3,2] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm30 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX512F-SLOW-NEXT: vpandnq %ymm27, %ymm30, %ymm27 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm13, %zmm27 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm9 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm13 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm7, %ymm7 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm12 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,ymm6[26],zero,ymm6[28],zero,ymm6[30],zero,zero,ymm6[29],zero,ymm6[31],zero,zero -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm14[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm7[2,2,3,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[21],zero,zero,ymm11[20],zero,ymm11[22],zero,ymm11[24],zero,zero,ymm11[23],zero,ymm11[25],zero,zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm11[26],zero,ymm11[28],zero,zero,ymm11[27],zero,ymm11[29],zero,ymm11[31],zero,zero,ymm11[30],zero -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[0,2,1,1,4,6,5,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm15 +; AVX512F-SLOW-NEXT: vporq %xmm0, %xmm15, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa64 32(%r8), %xmm25 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm25[1,1,2,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512F-SLOW-NEXT: vpandnq %ymm15, %ymm26, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm26 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm15 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[0,2,1,1,4,6,5,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,3,2] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX512F-SLOW-NEXT: vpandnq %ymm14, %ymm27, %ymm14 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm15, %ymm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm27 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm8 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm14 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm1 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512F-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm30, %ymm8 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm12, %ymm12 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm9[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm11[2,2,3,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm5[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,ymm4[27],zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30],zero +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,1,1,4,6,5,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm25, %zmm0, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm28, %zmm2, %zmm28 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm10 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm28, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm20[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm29, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[0,0,1,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm21[0,0,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm3 -; AVX512F-SLOW-NEXT: vpor %ymm13, %ymm12, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm12, %ymm14, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm24, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm22[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm16, %zmm6 +; AVX512F-SLOW-NEXT: vpor %ymm7, %ymm14, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm19, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] +; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm7, %ymm13, %ymm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm20, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm16, %zmm9 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm22[2,2,3,3,6,6,7,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm23[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm12, %ymm2, %ymm6 -; AVX512F-SLOW-NEXT: vpandq %ymm12, %ymm31, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm30, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm18[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vporq %zmm2, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm13, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm15, %ymm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm2, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm26 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm9[2,3,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,2,3,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm27 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,2,3,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm29[0,0,1,1] +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm13 +; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm7, %ymm1, %ymm12 +; AVX512F-SLOW-NEXT: vpandq %ymm7, %ymm30, %ymm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm31, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm24[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vporq %zmm3, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm13, %zmm3, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm3, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm26 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm8[2,3,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm27 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm15[2,2,3,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm25[0,0,1,1] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm25[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm28[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 64(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 128(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 256(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 256(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 192(%r9) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i8_stride5_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1 ; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm25 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 ; AVX512F-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm8 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm9 -; AVX512F-FAST-NEXT: vpor %ymm8, %ymm9, %ymm7 -; AVX512F-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm12 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm26 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm27 -; AVX512F-FAST-NEXT: vporq %xmm12, %xmm13, %xmm20 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm22 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,ymm3[27],zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30],zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm23 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[27],zero,zero,ymm4[26],zero,ymm4[28],zero,ymm4[30],zero,zero,ymm4[29],zero,ymm4[31],zero,zero -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[19],zero,ymm0[21],zero,zero,ymm0[20],zero,ymm0[22],zero,ymm0[24],zero,zero,ymm0[23],zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm18 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm4 +; AVX512F-FAST-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm4 +; AVX512F-FAST-NEXT: vporq %xmm3, %xmm4, %xmm19 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm22 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm23 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[27],zero,zero,ymm8[26],zero,ymm8[28],zero,ymm8[30],zero,zero,ymm8[29],zero,ymm8[31],zero,zero +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm30 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm24 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm1 -; AVX512F-FAST-NEXT: vporq %ymm13, %ymm1, %ymm21 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm10 -; AVX512F-FAST-NEXT: vporq %ymm1, %ymm10, %ymm24 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm16 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm15 -; AVX512F-FAST-NEXT: vporq %xmm1, %xmm15, %xmm25 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm15 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm5 -; AVX512F-FAST-NEXT: vporq %xmm15, %xmm5, %xmm26 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [1,1,2,2,2,2,2,2] -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm15, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX512F-FAST-NEXT: vpandnq %ymm15, %ymm27, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm14 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm30 -; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm31 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = <4,u,5,5,5,5,u,6,30,30,30,u,31,31,31,31> -; AVX512F-FAST-NEXT: vpermi2d %zmm31, %zmm5, %zmm27 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = <4,u,5,5,5,5,u,6> -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm5 -; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm28, %ymm28 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX512F-FAST-NEXT: vpandnq %ymm28, %ymm29, %ymm28 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm3, %zmm28 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm14 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm12[19],zero,ymm12[21],zero,zero,ymm12[20],zero,ymm12[22],zero,ymm12[24],zero,zero,ymm12[23],zero -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512F-FAST-NEXT: vporq %ymm0, %ymm1, %ymm20 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vporq %ymm0, %ymm1, %ymm21 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm16 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm2 +; AVX512F-FAST-NEXT: vporq %xmm0, %xmm2, %xmm28 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm14 +; AVX512F-FAST-NEXT: vporq %xmm0, %xmm14, %xmm29 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [1,1,2,2,2,2,2,2] +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm0 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512F-FAST-NEXT: vpandnq %ymm14, %ymm25, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm15 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm25 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <4,u,5,5,5,5,u,6,30,30,30,u,31,31,31,31> +; AVX512F-FAST-NEXT: vpermi2d %zmm25, %zmm0, %zmm31 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <4,u,5,5,5,5,u,6> +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX512F-FAST-NEXT: vpandnq %ymm15, %ymm26, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm26 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm1 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] ; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm12 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,zero,ymm7[26],zero,ymm7[28],zero,ymm7[30],zero,zero,ymm7[29],zero,ymm7[31],zero,zero -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm14[2,2,3,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[21],zero,zero,ymm11[20],zero,ymm11[22],zero,ymm11[24],zero,zero,ymm11[23],zero,ymm11[25],zero,zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm11[26],zero,ymm11[28],zero,zero,ymm11[27],zero,ymm11[29],zero,ymm11[31],zero,zero,ymm11[30],zero -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm10 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm12 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,ymm6[26],zero,ymm6[28],zero,ymm6[30],zero,zero,ymm6[29],zero,ymm6[31],zero,zero +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm9[2,2,3,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,ymm4[27],zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30],zero +; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm13 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm10 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,1] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm25, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm26, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm31, %zmm5 -; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm6 = mem[0,0,1,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm20[0,0,1,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm13 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm10 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,1] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm28, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm29, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm25, %zmm0 +; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm7 = mem[0,0,1,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm19[0,0,1,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm6, %zmm16, %zmm10 -; AVX512F-FAST-NEXT: vpor %ymm0, %ymm12, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm6, %ymm14, %ymm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm24, %zmm9 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm9 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm22[2,2,3,3,6,6,7,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm12 = zmm23[2,2,3,3,6,6,7,7] -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm12 -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm6, %ymm2, %ymm7 -; AVX512F-FAST-NEXT: vpandq %ymm6, %ymm29, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm18[2,2,3,3,6,6,7,7] -; AVX512F-FAST-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm12, %zmm2, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm11, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm30 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm27 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9> -; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm4[0,0,1,1,4,4,5,5] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,1,1,4,4,5,5] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 64(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 128(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 256(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 192(%r9) +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm13 +; AVX512F-FAST-NEXT: vpor %ymm15, %ymm12, %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm20, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] +; AVX512F-FAST-NEXT: vpternlogq $248, %ymm12, %ymm11, %ymm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm21, %zmm9 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm22[2,2,3,3,6,6,7,7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm23[2,2,3,3,6,6,7,7] +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm11 +; AVX512F-FAST-NEXT: vpternlogq $248, %ymm12, %ymm1, %ymm6 +; AVX512F-FAST-NEXT: vpandq %ymm12, %ymm30, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm24[2,2,3,3,6,6,7,7] +; AVX512F-FAST-NEXT: vporq %zmm7, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm11, %zmm7, %zmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm6, %zmm7, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm27 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm31 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm26 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9> +; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,1,1,4,4,5,5] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 128(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, 256(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 192(%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -4283,55 +4286,55 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm16 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm2, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm17 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm3, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] ; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] ; AVX512BW-ONLY-SLOW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm9, %ymm4 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm9, %ymm2 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm5, %zmm4 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %ymm16 ; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm17[0,0,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[0,0,1,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm9, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm9, %zmm2 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm23 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm23, %ymm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm24, %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm24, %ymm19 ; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm18, %ymm19, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm12, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm6, %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm12, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm6, %xmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm12, %xmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm14, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm13, %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm14, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm13, %xmm13 ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25 @@ -4340,36 +4343,36 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm26 ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX512BW-ONLY-SLOW-NEXT: movl $138547332, %eax # imm = 0x8421084 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm26, %ymm18 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm26, %ymm18 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm14, %zmm14 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm14, %zmm6 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm14, %zmm6 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <3,3,3,3,u,4,4,4> -; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm15, %ymm14, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm17[1,1,2,2] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm16, %ymm14, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,2,2] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm15, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm17, %zmm6 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX512BW-ONLY-SLOW-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm15, %zmm6 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm26, %ymm18 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm18, %ymm25, %ymm28 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm27, %ymm28, %ymm27 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm26, %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm17 ; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] ; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm25, %ymm16 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm23, %ymm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm25, %ymm17 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm27, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm23, %ymm26 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] ; AVX512BW-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] @@ -4382,75 +4385,75 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm23, %ymm8 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm15[0,2,1,1,4,6,5,5] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,3,2] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm25, %zmm8 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm23 = ymm16[0,2,1,1,4,6,5,5] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,3,2] +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm16[2,2,3,3,6,6,7,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm23, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm23 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm15, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %xmm16 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm16, %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm15 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm7, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm16, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm15, %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm23, %xmm22 +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm20, %xmm22, %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm16[0],xmm23[0],xmm16[1],xmm23[1],xmm16[2],xmm23[2],xmm16[3],xmm23[3],xmm16[4],xmm23[4],xmm16[5],xmm23[5],xmm16[6],xmm23[6],xmm16[7],xmm23[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm16, %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %xmm16 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm20, %zmm7, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm20, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm16, %xmm21 +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm21, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm20[0],xmm16[0],xmm20[1],xmm16[1],xmm20[2],xmm16[2],xmm20[3],xmm16[3],xmm20[4],xmm16[4],xmm20[5],xmm16[5],xmm20[6],xmm16[6],xmm20[7],xmm16[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm16, %xmm11 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm11, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5] ; AVX512BW-ONLY-SLOW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm7, %zmm11 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,0,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm7[0,0,1,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,0,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm16, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm1, %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm7, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm1, %ymm7 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm7, %ymm15, %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm7, %ymm16, %ymm7 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm2, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm18, %ymm3, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm4, %ymm12, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm1 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm18, %ymm4, %ymm4 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm5, %ymm14, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,1,1,4,6,5,5] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,1,1,4,6,5,5] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,3,2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%r9) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 256(%r9) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r9) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; @@ -4628,55 +4631,55 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm16 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm2, %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm17 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm3, %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] ; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] ; AVX512DQBW-SLOW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm9, %ymm4 {%k5} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm9, %ymm2 {%k5} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 ; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 ; AVX512DQBW-SLOW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm5, %zmm4 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k4} ; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm16 ; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm17 -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm17[0,0,1,1] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[0,0,1,1] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 ; AVX512DQBW-SLOW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm2 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm23 ; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] ; AVX512DQBW-SLOW-NEXT: vpshufb %ymm9, %ymm23, %ymm18 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm11, %ymm24, %ymm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm10, %ymm24, %ymm19 ; AVX512DQBW-SLOW-NEXT: vporq %ymm18, %ymm19, %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm12, %xmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm6, %xmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm12, %xmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm6, %xmm6 ; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm12, %xmm6 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm14, %xmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm13, %xmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm14, %xmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm13, %xmm13 ; AVX512DQBW-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25 @@ -4685,36 +4688,36 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm26 ; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX512DQBW-SLOW-NEXT: movl $138547332, %eax # imm = 0x8421084 -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm13, %ymm26, %ymm18 {%k2} +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k3 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm13, %ymm26, %ymm18 {%k3} ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm14, %zmm14 ; AVX512DQBW-SLOW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm14, %zmm6 {%k3} +; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm14, %zmm6 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <3,3,3,3,u,4,4,4> -; AVX512DQBW-SLOW-NEXT: vpermd %ymm15, %ymm14, %ymm18 -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm17[1,1,2,2] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermd %ymm16, %ymm14, %ymm18 +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,2,2] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,1] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm15, %zmm15 ; AVX512DQBW-SLOW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k6 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm17, %zmm6 {%k6} -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX512DQBW-SLOW-NEXT: # ymm17 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm6 {%k6} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX512DQBW-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm15, %ymm26, %ymm18 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] ; AVX512DQBW-SLOW-NEXT: vpshufb %ymm18, %ymm25, %ymm28 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] ; AVX512DQBW-SLOW-NEXT: vporq %ymm27, %ymm28, %ymm27 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm26, %ymm16 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm17 ; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] ; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm25, %ymm16 {%k5} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm23, %ymm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm25, %ymm17 {%k5} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm27, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm23, %ymm26 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3] ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] ; AVX512DQBW-SLOW-NEXT: # ymm27 = mem[0,1,0,1] @@ -4727,75 +4730,75 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm23, %ymm8 {%k1} ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k4} -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm15[0,2,1,1,4,6,5,5] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,3,2] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm25, %zmm8 {%k4} +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm23 = ymm16[0,2,1,1,4,6,5,5] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,3,2] +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm16[2,2,3,3,6,6,7,7] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm23, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm23 ; AVX512DQBW-SLOW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm15 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm15, %xmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %xmm16 ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm16, %xmm20 -; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm15 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm7, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm16, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm15, %xmm20 -; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm10 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm23, %xmm22 +; AVX512DQBW-SLOW-NEXT: vporq %xmm20, %xmm22, %xmm20 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm16[0],xmm23[0],xmm16[1],xmm23[1],xmm16[2],xmm23[2],xmm16[3],xmm23[3],xmm16[4],xmm23[4],xmm16[5],xmm23[5],xmm16[6],xmm23[6],xmm16[7],xmm23[7] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm7, %xmm16, %xmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %xmm16 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm20, %zmm7, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm20 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm20, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm16, %xmm21 +; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm21, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm20[0],xmm16[0],xmm20[1],xmm16[1],xmm20[2],xmm16[2],xmm20[3],xmm16[3],xmm20[4],xmm16[4],xmm20[5],xmm16[5],xmm20[6],xmm16[6],xmm20[7],xmm16[7] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm11, %xmm16, %xmm11 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm11, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5] ; AVX512DQBW-SLOW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm11 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %xmm7 -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,0,1,1] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm7[0,0,1,1] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,0,0,1] ; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm16, %zmm7 ; AVX512DQBW-SLOW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm1, %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm1, %ymm7 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm15 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpor %ymm7, %ymm15, %ymm7 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm16 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vporq %ymm7, %ymm16, %ymm7 ; AVX512DQBW-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX512DQBW-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm1 {%k2} -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm2, %ymm2 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm18, %ymm3, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpermd %ymm4, %ymm12, %ymm1 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm1 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm18, %ymm4, %ymm4 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k3} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} ; AVX512DQBW-SLOW-NEXT: vpermd %ymm5, %ymm14, %ymm0 -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,1,1,4,6,5,5] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,1,1,4,6,5,5] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,3,2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%r9) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 256(%r9) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r9) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index aae310a41fa01..0358af331d87a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -240,60 +240,61 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1] -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: pandn %xmm9, %xmm10 -; SSE-NEXT: por %xmm5, %xmm10 -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,1,0,1] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm10, %xmm6 +; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,3] +; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm8, 32(%rax) +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm5, 32(%rax) ; SSE-NEXT: movdqa %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm10, (%rax) +; SSE-NEXT: movdqa %xmm6, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride6_vf8: @@ -443,13 +444,13 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa (%rdi), %xmm12 ; SSE-NEXT: movdqa (%rsi), %xmm8 ; SSE-NEXT: movdqa (%rdx), %xmm13 ; SSE-NEXT: movdqa (%rcx), %xmm2 -; SSE-NEXT: movdqa (%r8), %xmm9 -; SSE-NEXT: movdqa (%r9), %xmm12 -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa (%r8), %xmm11 +; SSE-NEXT: movdqa (%r9), %xmm10 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] @@ -464,52 +465,52 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm6 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm3, %xmm9 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: por %xmm9, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,2,2] +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,2,2] ; SSE-NEXT: pand %xmm1, %xmm8 ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: por %xmm8, %xmm9 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm8, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,2,2] +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,2,2] ; SSE-NEXT: movdqa %xmm8, %xmm14 ; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: por %xmm10, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm12[8],xmm2[9],xmm12[9],xmm2[10],xmm12[10],xmm2[11],xmm12[11],xmm2[12],xmm12[12],xmm2[13],xmm12[13],xmm2[14],xmm12[14],xmm2[15],xmm12[15] +; SSE-NEXT: por %xmm9, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[1,1,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm15, %xmm12 -; SSE-NEXT: pand %xmm10, %xmm14 -; SSE-NEXT: por %xmm14, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[3,3,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm15, %xmm10 +; SSE-NEXT: pand %xmm9, %xmm14 +; SSE-NEXT: por %xmm14, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[3,3,3,3] ; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm14, %xmm15 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,2,2,3] ; SSE-NEXT: pand %xmm1, %xmm14 ; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm9[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm11[2,2,3,3] ; SSE-NEXT: movdqa %xmm4, %xmm5 ; SSE-NEXT: pandn %xmm15, %xmm5 ; SSE-NEXT: pand %xmm4, %xmm14 @@ -524,18 +525,18 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa %xmm4, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,1,1] ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 ; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm5, %xmm9 -; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: por %xmm5, %xmm11 +; SSE-NEXT: pand %xmm3, %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0] ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: por %xmm11, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; SSE-NEXT: movdqa %xmm1, %xmm2 @@ -549,8 +550,8 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] ; SSE-NEXT: pandn %xmm0, %xmm14 ; SSE-NEXT: por %xmm4, %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] @@ -563,16 +564,16 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] ; SSE-NEXT: pandn %xmm0, %xmm8 ; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,2] +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm8, %xmm9 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm10, 16(%rax) +; SSE-NEXT: movdqa %xmm9, 16(%rax) ; SSE-NEXT: movdqa %xmm14, 32(%rax) ; SSE-NEXT: movdqa %xmm3, 48(%rax) ; SSE-NEXT: movdqa %xmm15, 80(%rax) -; SSE-NEXT: movdqa %xmm12, 64(%rax) +; SSE-NEXT: movdqa %xmm10, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: retq @@ -802,34 +803,34 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 ; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 ; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm7 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[3,3,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm6, %xmm4 ; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -838,192 +839,190 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: pand %xmm10, %xmm6 ; SSE-NEXT: por %xmm6, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm12 +; SSE-NEXT: movdqa (%rsi), %xmm14 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa (%rdx), %xmm14 -; SSE-NEXT: movdqa (%rcx), %xmm6 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: movdqa (%rdx), %xmm11 +; SSE-NEXT: movdqa (%rcx), %xmm12 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,7,7] -; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa (%r8), %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm15 -; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm6, %xmm15 ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa (%r9), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE-NEXT: movdqa (%r9), %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm15 ; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm3, %xmm15 ; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm15 ; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,2,2] -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3],xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,2,2] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,2,2] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,0,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] ; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm4, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,7,7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,2,3,3] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm8, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pand %xmm10, %xmm8 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,2] ; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,2,2] -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: pand %xmm5, %xmm8 +; SSE-NEXT: por %xmm8, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm0, %xmm8 ; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,2] ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,0,2,2,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,1,1] ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm8 +; SSE-NEXT: pand %xmm7, %xmm8 ; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: pshuflw $161, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,0,2,2,4,5,6,7] @@ -1034,44 +1033,44 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: # xmm1 = mem[0,0,1,1] ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshufd $0, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: pand %xmm3, %xmm8 ; SSE-NEXT: por %xmm8, %xmm13 -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: pshufd $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm7, %xmm13 +; SSE-NEXT: pshufd $0, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,0,0] -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por %xmm13, %xmm7 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: pshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,3] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm8, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm8, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: pand %xmm10, %xmm9 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm9, %xmm10 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm7, 32(%rax) -; SSE-NEXT: movdqa %xmm10, 48(%rax) +; SSE-NEXT: movdqa %xmm10, 32(%rax) +; SSE-NEXT: movdqa %xmm7, 48(%rax) ; SSE-NEXT: movdqa %xmm0, 96(%rax) -; SSE-NEXT: movdqa %xmm15, 112(%rax) -; SSE-NEXT: movdqa %xmm6, 160(%rax) -; SSE-NEXT: movdqa %xmm11, 176(%rax) -; SSE-NEXT: movdqa %xmm12, (%rax) -; SSE-NEXT: movdqa %xmm14, 16(%rax) +; SSE-NEXT: movdqa %xmm5, 112(%rax) +; SSE-NEXT: movdqa %xmm15, 160(%rax) +; SSE-NEXT: movdqa %xmm6, 176(%rax) +; SSE-NEXT: movdqa %xmm11, (%rax) +; SSE-NEXT: movdqa %xmm12, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1087,9 +1086,8 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,3,3,3] ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 @@ -1097,8 +1095,8 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,5,6,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[1,0,2,2,4,5,6,7] @@ -1111,15 +1109,15 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm12[8,u],zero,zero,zero,zero,xmm12[9,u],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4],zero,xmm3[6,7,8,9,10],zero,xmm3[12,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm14 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm14[8],zero,zero,zero,zero,zero,xmm14[9],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm13 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm13[8],zero,zero,zero,zero,zero,xmm13[9],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm12[5,u],zero,zero,zero,zero,xmm12[6,u],zero,zero,zero,zero,xmm12[7,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm14[5],zero,zero,zero,zero,zero,xmm14[6],zero,zero,zero,zero,zero,xmm14[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm13[5],zero,zero,zero,zero,zero,xmm13[6],zero,zero,zero,zero,zero,xmm13[7] ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] @@ -1136,114 +1134,112 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[10,u],zero,zero,zero,zero,xmm12[11,u],zero,zero,zero,zero,xmm12[12,u],zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5,6],zero,xmm1[8,9,10,11,12],zero,xmm1[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm14[10],zero,zero,zero,zero,zero,xmm14[11],zero,zero,zero,zero,zero,xmm14[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm13[10],zero,zero,zero,zero,zero,xmm13[11],zero,zero,zero,zero,zero,xmm13[12],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm1 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,13,128,128,128,128,128,14,128,128,128,128,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm13[13],zero,zero,zero,zero,zero,xmm13[14],zero,zero,zero,zero,zero,xmm13[15] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm5[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,5,6,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3],xmm0[4],xmm9[5,6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm15[1,2],xmm8[3],xmm15[4,5],xmm8[6],xmm15[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0],zero,xmm8[2,3,4,5,6],zero,xmm8[8,9,10,11,12],zero,xmm8[14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm0[13],zero,zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,zero,xmm0[15] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm15[1,2],xmm2[3],xmm15[4,5],xmm2[6],xmm15[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5,6],zero,xmm2[8,9,10,11,12],zero,xmm2[14,15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm9[0,0,1,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm9 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm14[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm15, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm15, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm10, %ymm8, %ymm10 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm12[0,u],zero,zero,zero,zero,xmm12[1,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,xmm14[0],zero,zero,zero,zero,zero,xmm14[1],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1,2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm11 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm2, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm11, %ymm9, %ymm15 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm12[0,u],zero,zero,zero,zero,xmm12[1,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0,1],xmm11[2],xmm15[3,4],xmm11[5],xmm15[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,zero,xmm13[0],zero,zero,zero,zero,zero,xmm13[1],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1,2],xmm12[3],xmm10[4,5],xmm12[6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[1,0,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm6[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm15, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm2, %ymm7 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm1[0,u],zero,zero,zero,zero,xmm1[1,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2],xmm7[3,4],xmm10[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1,2],xmm2[3],xmm8[4,5],xmm2[6],xmm8[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm1[0,u],zero,zero,zero,zero,xmm1[1,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm1[5,u],zero,zero,zero,zero,xmm1[6,u],zero,zero,zero,zero,xmm1[7,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[4,5,6,7,8],zero,xmm4[10,11,12,13,14],zero @@ -1252,7 +1248,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -1260,7 +1256,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm7, (%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 112(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 112(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm11, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rax) @@ -1279,127 +1275,124 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-SLOW-LABEL: store_i8_stride6_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm3 +; AVX2-SLOW-NEXT: pushq %rax +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm6 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm9 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm10 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm8 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm5, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm9 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm7 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm14 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm5 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[4],ymm12[4],ymm8[5],ymm12[5],ymm8[6],ymm12[6],ymm8[7],ymm12[7],ymm8[16],ymm12[16],ymm8[17],ymm12[17],ymm8[18],ymm12[18],ymm8[19],ymm12[19],ymm8[20],ymm12[20],ymm8[21],ymm12[21],ymm8[22],ymm12[22],ymm8[23],ymm12[23] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm15 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm8[0],ymm15[0],ymm8[1],ymm15[1],ymm8[2],ymm15[2],ymm8[3],ymm15[3],ymm8[4],ymm15[4],ymm8[5],ymm15[5],ymm8[6],ymm15[6],ymm8[7],ymm15[7],ymm8[16],ymm15[16],ymm8[17],ymm15[17],ymm8[18],ymm15[18],ymm8[19],ymm15[19],ymm8[20],ymm15[20],ymm8[21],ymm15[21],ymm8[22],ymm15[22],ymm8[23],ymm15[23] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm7, %ymm10, %ymm14 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm7[0],ymm10[0],ymm7[1],ymm10[1],ymm7[2],ymm10[2],ymm7[3],ymm10[3],ymm7[4],ymm10[4],ymm7[5],ymm10[5],ymm7[6],ymm10[6],ymm7[7],ymm10[7],ymm7[16],ymm10[16],ymm7[17],ymm10[17],ymm7[18],ymm10[18],ymm7[19],ymm10[19],ymm7[20],ymm10[20],ymm7[21],ymm10[21],ymm7[22],ymm10[22],ymm7[23],ymm10[23] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm7 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm7[0],ymm15[0],ymm7[1],ymm15[1],ymm7[2],ymm15[2],ymm7[3],ymm15[3],ymm7[4],ymm15[4],ymm7[5],ymm15[5],ymm7[6],ymm15[6],ymm7[7],ymm15[7],ymm7[16],ymm15[16],ymm7[17],ymm15[17],ymm7[18],ymm15[18],ymm7[19],ymm15[19],ymm7[20],ymm15[20],ymm7[21],ymm15[21],ymm7[22],ymm15[22],ymm7[23],ymm15[23] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm12, %ymm15, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm10, %ymm15, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm10 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm10, %ymm15, %ymm13 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm15, %ymm12 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm9, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm12, %ymm14, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm13 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,0,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[16],ymm7[16],ymm1[17],ymm7[17],ymm1[18],ymm7[18],ymm1[19],ymm7[19],ymm1[20],ymm7[20],ymm1[21],ymm7[21],ymm1[22],ymm7[22],ymm1[23],ymm7[23] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm12 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm15, %ymm10, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm15, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm9, %ymm9 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm10, %ymm14, %ymm14 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm10, %ymm15 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm9, %ymm15 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm7[8],ymm2[9],ymm7[9],ymm2[10],ymm7[10],ymm2[11],ymm7[11],ymm2[12],ymm7[12],ymm2[13],ymm7[13],ymm2[14],ymm7[14],ymm2[15],ymm7[15],ymm2[24],ymm7[24],ymm2[25],ymm7[25],ymm2[26],ymm7[26],ymm2[27],ymm7[27],ymm2[28],ymm7[28],ymm2[29],ymm7[29],ymm2[30],ymm7[30],ymm2[31],ymm7[31] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm12[8],ymm2[9],ymm12[9],ymm2[10],ymm12[10],ymm2[11],ymm12[11],ymm2[12],ymm12[12],ymm2[13],ymm12[13],ymm2[14],ymm12[14],ymm2[15],ymm12[15],ymm2[24],ymm12[24],ymm2[25],ymm12[25],ymm2[26],ymm12[26],ymm2[27],ymm12[27],ymm2[28],ymm12[28],ymm2[29],ymm12[29],ymm2[30],ymm12[30],ymm2[31],ymm12[31] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -1411,268 +1404,281 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm15, 96(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm14, (%rax) +; AVX2-SLOW-NEXT: popq %rax ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride6_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-FAST-NEXT: subq $40, %rsp +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm8 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm7 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm11 -; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm9 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm11 +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm10 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm9 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm7, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm14 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[16],ymm12[16],ymm10[17],ymm12[17],ymm10[18],ymm12[18],ymm10[19],ymm12[19],ymm10[20],ymm12[20],ymm10[21],ymm12[21],ymm10[22],ymm12[22],ymm10[23],ymm12[23] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm15 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm10 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm14 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm9 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[16],ymm12[16],ymm9[17],ymm12[17],ymm9[18],ymm12[18],ymm9[19],ymm12[19],ymm9[20],ymm12[20],ymm9[21],ymm12[21],ymm9[22],ymm12[22],ymm9[23],ymm12[23] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm15 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm9[0],ymm15[0],ymm9[1],ymm15[1],ymm9[2],ymm15[2],ymm9[3],ymm15[3],ymm9[4],ymm15[4],ymm9[5],ymm15[5],ymm9[6],ymm15[6],ymm9[7],ymm15[7],ymm9[16],ymm15[16],ymm9[17],ymm15[17],ymm9[18],ymm15[18],ymm9[19],ymm15[19],ymm9[20],ymm15[20],ymm9[21],ymm15[21],ymm9[22],ymm15[22],ymm9[23],ymm15[23] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm12, %ymm15, %ymm11 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm15, %ymm13 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX2-FAST-NEXT: vmovdqa %xmm9, %xmm13 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] +; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm13 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm9 -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm8 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15],ymm8[24],ymm6[24],ymm8[25],ymm6[25],ymm8[26],ymm6[26],ymm8[27],ymm6[27],ymm8[28],ymm6[28],ymm8[29],ymm6[29],ymm8[30],ymm6[30],ymm8[31],ymm6[31] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm15, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm15, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm15 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 128(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm14, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm11, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm15, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm14, 64(%rax) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-NEXT: addq $40, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride6_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: subq $40, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm5, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[16],ymm12[16],ymm10[17],ymm12[17],ymm10[18],ymm12[18],ymm10[19],ymm12[19],ymm10[20],ymm12[20],ymm10[21],ymm12[21],ymm10[22],ymm12[22],ymm10[23],ymm12[23] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[16],ymm12[16],ymm9[17],ymm12[17],ymm9[18],ymm12[18],ymm9[19],ymm12[19],ymm9[20],ymm12[20],ymm9[21],ymm12[21],ymm9[22],ymm12[22],ymm9[23],ymm12[23] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm9[0],ymm15[0],ymm9[1],ymm15[1],ymm9[2],ymm15[2],ymm9[3],ymm15[3],ymm9[4],ymm15[4],ymm9[5],ymm15[5],ymm9[6],ymm15[6],ymm9[7],ymm15[7],ymm9[16],ymm15[16],ymm9[17],ymm15[17],ymm9[18],ymm15[18],ymm9[19],ymm15[19],ymm9[20],ymm15[20],ymm9[21],ymm15[21],ymm9[22],ymm15[22],ymm9[23],ymm15[23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm15, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm15, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15],ymm8[24],ymm6[24],ymm8[25],ymm6[25],ymm8[26],ymm6[26],ymm8[27],ymm6[27],ymm8[28],ymm6[28],ymm8[29],ymm6[29],ymm8[30],ymm6[30],ymm8[31],ymm6[31] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm15, %ymm11, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm15, %ymm11, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm14, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm14, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $40, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2125,13 +2131,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,0,1,1] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm10 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] -; SSE-NEXT: movdqa %xmm14, %xmm12 ; SSE-NEXT: pandn %xmm7, %xmm12 ; SSE-NEXT: por %xmm3, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] @@ -2144,11 +2149,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm12, %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: pand %xmm12, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,0,0] -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: por %xmm13, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[0,0,0,0] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm13 @@ -2156,19 +2162,19 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm7 ; SSE-NEXT: por %xmm7, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,1,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: pand %xmm12, %xmm13 -; SSE-NEXT: por %xmm13, %xmm15 +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm7, %xmm14 +; SSE-NEXT: pand %xmm0, %xmm13 +; SSE-NEXT: por %xmm13, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pand %xmm13, %xmm15 -; SSE-NEXT: por %xmm15, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[1,1,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm14 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[3,3,3,3] ; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm7, %xmm8 @@ -2177,24 +2183,26 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm3, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm0, %xmm10 ; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: pand %xmm14, %xmm7 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm11 ; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm10 +; SSE-NEXT: pand %xmm14, %xmm10 ; SSE-NEXT: por %xmm10, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] -; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: pand %xmm11, %xmm5 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] -; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: movdqa %xmm11, %xmm8 ; SSE-NEXT: pandn %xmm7, %xmm8 ; SSE-NEXT: por %xmm5, %xmm8 ; SSE-NEXT: pand %xmm3, %xmm8 @@ -2203,12 +2211,10 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm5, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm7 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2234,103 +2240,105 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: movdqa 16(%rdx), %xmm11 +; SSE-NEXT: movdqa 16(%rdx), %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: movdqa 16(%rcx), %xmm5 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: pand %xmm14, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm9 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm6 ; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movdqa 16(%rsi), %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1] -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa 16(%r8), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,0,1,1] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa 16(%r8), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm10 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm7, %xmm13 ; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: por %xmm6, %xmm15 +; SSE-NEXT: por %xmm6, %xmm13 ; SSE-NEXT: movdqa 16(%r9), %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,0,0] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[0,0,0,0] ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: por %xmm15, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm9[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm13 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,2,2] -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: por %xmm7, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,2,2] +; SSE-NEXT: pandn %xmm13, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,2,2] +; SSE-NEXT: pand %xmm3, %xmm13 +; SSE-NEXT: por %xmm13, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 +; SSE-NEXT: pandn %xmm13, %xmm12 ; SSE-NEXT: pand %xmm0, %xmm15 ; SSE-NEXT: por %xmm15, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por %xmm12, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm12 +; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: por %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] ; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,3] -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: por %xmm12, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm9, %xmm10 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,2,3] +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm11 +; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm5[8],xmm11[9],xmm5[9],xmm11[10],xmm5[10],xmm11[11],xmm5[11],xmm11[12],xmm5[12],xmm11[13],xmm5[13],xmm11[14],xmm5[14],xmm11[15],xmm5[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[1,0,2,2,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm7 ; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] -; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; SSE-NEXT: pand %xmm10, %xmm5 ; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm5, %xmm8 ; SSE-NEXT: pand %xmm3, %xmm7 @@ -2344,42 +2352,41 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm0, %xmm15 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm0, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: pand %xmm0, %xmm7 ; SSE-NEXT: por %xmm7, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,5,6,7,7] +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdx), %xmm1 @@ -2388,17 +2395,17 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: movdqa %xmm10, %xmm6 ; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa 32(%rdi), %xmm4 +; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa 32(%rsi), %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1] -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa 32(%r8), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,0,1,1] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa 32(%r8), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm10 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm12 @@ -2406,63 +2413,64 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: por %xmm6, %xmm12 ; SSE-NEXT: movdqa 32(%r9), %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: por %xmm12, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm9[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,2,2] -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: por %xmm7, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: por %xmm12, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm15 -; SSE-NEXT: por %xmm15, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[3,3,3,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm12 +; SSE-NEXT: por %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm12, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,2,2] +; SSE-NEXT: pand %xmm3, %xmm12 +; SSE-NEXT: por %xmm12, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm12, %xmm15 +; SSE-NEXT: pand %xmm0, %xmm13 +; SSE-NEXT: por %xmm13, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm15 +; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] ; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,3] -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: por %xmm12, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm9, %xmm10 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,2,3] +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm10 -; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm11 +; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm7 ; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] -; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; SSE-NEXT: pand %xmm10, %xmm5 ; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm5, %xmm8 ; SSE-NEXT: pand %xmm3, %xmm7 @@ -2480,118 +2488,117 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm0, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: pand %xmm0, %xmm7 ; SSE-NEXT: por %xmm7, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm7 ; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pand %xmm13, %xmm8 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: movdqa 48(%rdx), %xmm10 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdx), %xmm9 ; SSE-NEXT: movdqa 48(%rcx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm8 +; SSE-NEXT: movdqa %xmm9, %xmm8 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa %xmm14, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 48(%rsi), %xmm9 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa 48(%rsi), %xmm10 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa 48(%r8), %xmm7 ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: por %xmm12, %xmm13 -; SSE-NEXT: movdqa 48(%r9), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: movdqa 48(%r9), %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[0,0,0,0] ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm15 -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: por %xmm13, %xmm15 +; SSE-NEXT: pandn %xmm13, %xmm15 +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: por %xmm12, %xmm15 ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm12 ; SSE-NEXT: por %xmm12, %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: pand %xmm4, %xmm13 ; SSE-NEXT: por %xmm13, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm13 ; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] ; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm14, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[8],mem[8],xmm10[9],mem[9],xmm10[10],mem[10],xmm10[11],mem[11],xmm10[12],mem[12],xmm10[13],mem[13],xmm10[14],mem[14],xmm10[15],mem[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,0,2,2,4,5,6,7] +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[8],mem[8],xmm9[9],mem[9],xmm9[10],mem[10],xmm9[11],mem[11],xmm9[12],mem[12],xmm9[13],mem[13],xmm9[14],mem[14],xmm9[15],mem[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,1,1] -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,0,0] @@ -2601,52 +2608,52 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] ; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: pand %xmm2, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: por %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: por %xmm3, %xmm14 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm14, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: por %xmm2, %xmm14 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm2, 368(%rax) -; SSE-NEXT: movdqa %xmm9, 352(%rax) +; SSE-NEXT: movdqa %xmm14, 368(%rax) +; SSE-NEXT: movdqa %xmm10, 352(%rax) ; SSE-NEXT: movdqa %xmm4, 336(%rax) ; SSE-NEXT: movdqa %xmm8, 320(%rax) ; SSE-NEXT: movdqa %xmm13, 304(%rax) ; SSE-NEXT: movdqa %xmm15, 288(%rax) -; SSE-NEXT: movdqa %xmm11, 272(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2687,190 +2694,191 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-LABEL: store_i8_stride6_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $200, %rsp -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,3] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm2[8,u],zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4],zero,xmm4[6,7,8,9,10],zero,xmm4[12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm2[8,u],zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,1,2,3,4],zero,xmm5[6,7,8,9,10],zero,xmm5[12,13,14,15] ; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm5[8],zero,zero,zero,zero,zero,xmm5[9],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm5[8],zero,zero,zero,zero,zero,xmm5[9],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm2[5,u],zero,zero,zero,zero,xmm2[6,u],zero,zero,zero,zero,xmm2[7,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5,6],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm5[5],zero,zero,zero,zero,zero,xmm5[6],zero,zero,zero,zero,zero,xmm5[7] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm10, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[13],zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,xmm5[15] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm2[5,u],zero,zero,zero,zero,xmm2[6,u],zero,zero,zero,zero,xmm2[7,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm5[5],zero,zero,zero,zero,zero,xmm5[6],zero,zero,zero,zero,zero,xmm5[7] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[13,u],zero,zero,zero,zero,xmm2[14,u],zero,zero,zero,zero,xmm2[15,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm5[13],zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,xmm5[15] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5,6],zero,xmm1[8,9,10,11,12],zero,xmm1[14,15] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [128,10,128,128,128,128,128,11,128,128,128,128,128,12,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2,3,4,5,6],zero,xmm0[8,9,10,11,12],zero,xmm0[14,15] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [128,10,128,128,128,128,128,11,128,128,128,128,128,12,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm13, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm11[13,u],zero,zero,zero,zero,xmm11[14,u],zero,zero,zero,zero,xmm11[15,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3],xmm13[4],xmm0[5,6],xmm13[7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm0[13],zero,zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,zero,xmm0[15] -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[10,u],zero,zero,zero,zero,xmm11[11,u],zero,zero,zero,zero,xmm11[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1,2],xmm13[3],xmm8[4,5],xmm13[6],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm10, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6],xmm10[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[0,1,2],zero,xmm8[4,5,6,7,8],zero,xmm8[10,11,12,13,14],zero +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm8[13],zero,zero,zero,zero,zero,xmm8[14],zero,zero,zero,zero,zero,xmm8[15] +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5],xmm10[6],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm9, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[2,u],zero,zero,zero,zero,xmm2[3,u],zero,zero,zero,zero,xmm2[4,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5],xmm9[6],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm3[1,2],xmm9[3],xmm3[4,5],xmm9[6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2],xmm6[3,4],xmm2[5],xmm6[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[2,u],zero,zero,zero,zero,xmm11[3,u],zero,zero,zero,zero,xmm11[4,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,u],zero,zero,zero,zero,xmm1[3,u],zero,zero,zero,zero,xmm1[4,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0],zero,xmm5[2,3,4,5,6],zero,xmm5[8,9,10,11,12],zero,xmm5[14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,5,6,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[1,0,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,8,u,128,128,128,128,9,u,128,128,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,8,128,128,128,128,128,9,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,5,u,128,128,128,128,6,u,128,128,128,128,7,u> -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,5,128,128,128,128,128,6,128,128,128,128,128,7] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,5,128,128,128,128,128,6,128,128,128,128,128,7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 @@ -2882,174 +2890,176 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2],xmm1[3,4],xmm5[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,2] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,13,128,128,128,128,128,14,128,128,128,128,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[10,u],zero,zero,zero,zero,xmm3[11,u],zero,zero,zero,zero,xmm3[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[10],zero,zero,zero,zero,zero,xmm1[11],zero,zero,zero,zero,zero,xmm1[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,13,128,128,128,128,128,14,128,128,128,128,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <10,u,128,128,128,128,11,u,128,128,128,128,12,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm2[10],zero,zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,xmm2[12],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm9[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm2[13,u],zero,zero,zero,zero,xmm2[14,u],zero,zero,zero,zero,xmm2[15,u] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3],xmm0[4],xmm15[5,6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8],zero,xmm0[10,11,12,13,14],zero ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2],xmm13[3],xmm14[4,5],xmm13[6],xmm14[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2],xmm15[3],xmm12[4,5],xmm15[6],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm10[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm15, %ymm11 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,0,1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm13, %ymm10 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm12, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm15, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm13, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm13, %ymm10 ; AVX1-ONLY-NEXT: vorps %ymm10, %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm10[1,2],xmm13[3],xmm10[4,5],xmm13[6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1,2],xmm15[3],xmm10[4,5],xmm15[6],xmm10[7] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm10, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2],xmm11[3,4],xmm3[5],xmm11[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[1,0,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm6 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1,2],xmm3[3],xmm7[4,5],xmm3[6],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1,2],xmm3[3],xmm8[4,5],xmm3[6],xmm8[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0],zero,xmm3[2,3,4,5,6],zero,xmm3[8,9,10,11,12],zero,xmm3[14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] -; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm2[8,u],zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,3,3,3] +; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm7 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[5,u],zero,zero,zero,zero,xmm2[6,u],zero,zero,zero,zero,xmm2[7,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2],zero,xmm2[4,5,6,7,8],zero,xmm2[10,11,12,13,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[5,u],zero,zero,zero,zero,xmm1[6,u],zero,zero,zero,zero,xmm1[7,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7,8],zero,xmm1[10,11,12,13,14],zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3094,52 +3104,56 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-SLOW-LABEL: store_i8_stride6_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $696, %rsp # imm = 0x2B8 +; AVX2-SLOW-NEXT: subq $664, %rsp # imm = 0x298 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 ; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm8 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm11 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[16],ymm3[16],ymm5[17],ymm3[17],ymm5[18],ymm3[18],ymm5[19],ymm3[19],ymm5[20],ymm3[20],ymm5[21],ymm3[21],ymm5[22],ymm3[22],ymm5[23],ymm3[23] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm6 @@ -3148,257 +3162,256 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm7 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm6, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[16],ymm6[16],ymm3[17],ymm6[17],ymm3[18],ymm6[18],ymm3[19],ymm6[19],ymm3[20],ymm6[20],ymm3[21],ymm6[21],ymm3[22],ymm6[22],ymm3[23],ymm6[23] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm9 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm9, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm12 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm5 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm7 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm9 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm10, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm11 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[4],mem[4],ymm12[5],mem[5],ymm12[6],mem[6],ymm12[7],mem[7],ymm12[16],mem[16],ymm12[17],mem[17],ymm12[18],mem[18],ymm12[19],mem[19],ymm12[20],mem[20],ymm12[21],mem[21],ymm12[22],mem[22],ymm12[23],mem[23] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[4],ymm11[4],ymm8[5],ymm11[5],ymm8[6],ymm11[6],ymm8[7],ymm11[7],ymm8[16],ymm11[16],ymm8[17],ymm11[17],ymm8[18],ymm11[18],ymm8[19],ymm11[19],ymm8[20],ymm11[20],ymm8[21],ymm11[21],ymm8[22],ymm11[22],ymm8[23],ymm11[23] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm13 -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm15 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm15, %ymm15 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm10 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm14 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm14, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm15[0],mem[0],ymm15[1],mem[1],ymm15[2],mem[2],ymm15[3],mem[3],ymm15[4],mem[4],ymm15[5],mem[5],ymm15[6],mem[6],ymm15[7],mem[7],ymm15[16],mem[16],ymm15[17],mem[17],ymm15[18],mem[18],ymm15[19],mem[19],ymm15[20],mem[20],ymm15[21],mem[21],ymm15[22],mem[22],ymm15[23],mem[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[16],ymm13[16],ymm12[17],ymm13[17],ymm12[18],ymm13[18],ymm12[19],ymm13[19],ymm12[20],ymm13[20],ymm12[21],ymm13[21],ymm12[22],ymm13[22],ymm12[23],ymm13[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[4],mem[4],ymm3[5],mem[5],ymm3[6],mem[6],ymm3[7],mem[7],ymm3[16],mem[16],ymm3[17],mem[17],ymm3[18],mem[18],ymm3[19],mem[19],ymm3[20],mem[20],ymm3[21],mem[21],ymm3[22],mem[22],ymm3[23],mem[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm11, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm15, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm11 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm6, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm10, %ymm15 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15],ymm2[24],mem[24],ymm2[25],mem[25],ymm2[26],mem[26],ymm2[27],mem[27],ymm2[28],mem[28],ymm2[29],mem[29],ymm2[30],mem[30],ymm2[31],mem[31] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15],ymm2[24],mem[24],ymm2[25],mem[25],ymm2[26],mem[26],ymm2[27],mem[27],ymm2[28],mem[28],ymm2[29],mem[29],ymm2[30],mem[30],ymm2[31],mem[31] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm14 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm14, %ymm10 -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm12[8],mem[8],ymm12[9],mem[9],ymm12[10],mem[10],ymm12[11],mem[11],ymm12[12],mem[12],ymm12[13],mem[13],ymm12[14],mem[14],ymm12[15],mem[15],ymm12[24],mem[24],ymm12[25],mem[25],ymm12[26],mem[26],ymm12[27],mem[27],ymm12[28],mem[28],ymm12[29],mem[29],ymm12[30],mem[30],ymm12[31],mem[31] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm2 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm14 = ymm8[8],ymm11[8],ymm8[9],ymm11[9],ymm8[10],ymm11[10],ymm8[11],ymm11[11],ymm8[12],ymm11[12],ymm8[13],ymm11[13],ymm8[14],ymm11[14],ymm8[15],ymm11[15],ymm8[24],ymm11[24],ymm8[25],ymm11[25],ymm8[26],ymm11[26],ymm8[27],ymm11[27],ymm8[28],ymm11[28],ymm8[29],ymm11[29],ymm8[30],ymm11[30],ymm8[31],ymm11[31] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm15[8],mem[8],ymm15[9],mem[9],ymm15[10],mem[10],ymm15[11],mem[11],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15],ymm15[24],mem[24],ymm15[25],mem[25],ymm15[26],mem[26],ymm15[27],mem[27],ymm15[28],mem[28],ymm15[29],mem[29],ymm15[30],mem[30],ymm15[31],mem[31] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm15, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15],ymm12[24],ymm13[24],ymm12[25],ymm13[25],ymm12[26],ymm13[26],ymm12[27],ymm13[27],ymm12[28],ymm13[28],ymm12[29],ymm13[29],ymm12[30],ymm13[30],ymm12[31],ymm13[31] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm5[8],mem[8],ymm5[9],mem[9],ymm5[10],mem[10],ymm5[11],mem[11],ymm5[12],mem[12],ymm5[13],mem[13],ymm5[14],mem[14],ymm5[15],mem[15],ymm5[24],mem[24],ymm5[25],mem[25],ymm5[26],mem[26],ymm5[27],mem[27],ymm5[28],mem[28],ymm5[29],mem[29],ymm5[30],mem[30],ymm5[31],mem[31] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm8, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm8 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm4, %ymm8, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 352(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 352(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3408,27 +3421,27 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 256(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) -; AVX2-SLOW-NEXT: addq $696, %rsp # imm = 0x2B8 +; AVX2-SLOW-NEXT: addq $664, %rsp # imm = 0x298 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $648, %rsp # imm = 0x288 +; AVX2-FAST-NEXT: subq $680, %rsp # imm = 0x2A8 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 @@ -3437,204 +3450,206 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm12 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm13 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm13 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm14 +; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm10, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[16],ymm3[16],ymm5[17],ymm3[17],ymm5[18],ymm3[18],ymm5[19],ymm3[19],ymm5[20],ymm3[20],ymm5[21],ymm3[21],ymm5[22],ymm3[22],ymm5[23],ymm3[23] ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[16],ymm6[16],ymm3[17],ymm6[17],ymm3[18],ymm6[18],ymm3[19],ymm6[19],ymm3[20],ymm6[20],ymm3[21],ymm6[21],ymm3[22],ymm6[22],ymm3[23],ymm6[23] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23] ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm15 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm13, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm6 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm14[8],xmm8[9],xmm14[9],xmm8[10],xmm14[10],xmm8[11],xmm14[11],xmm8[12],xmm14[12],xmm8[13],xmm14[13],xmm8[14],xmm14[14],xmm8[15],xmm14[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm12[8],ymm0[9],ymm12[9],ymm0[10],ymm12[10],ymm0[11],ymm12[11],ymm0[12],ymm12[12],ymm0[13],ymm12[13],ymm0[14],ymm12[14],ymm0[15],ymm12[15],ymm0[24],ymm12[24],ymm0[25],ymm12[25],ymm0[26],ymm12[26],ymm0[27],ymm12[27],ymm0[28],ymm12[28],ymm0[29],ymm12[29],ymm0[30],ymm12[30],ymm0[31],ymm12[31] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm12 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm7[8],ymm14[8],ymm7[9],ymm14[9],ymm7[10],ymm14[10],ymm7[11],ymm14[11],ymm7[12],ymm14[12],ymm7[13],ymm14[13],ymm7[14],ymm14[14],ymm7[15],ymm14[15],ymm7[24],ymm14[24],ymm7[25],ymm14[25],ymm7[26],ymm14[26],ymm7[27],ymm14[27],ymm7[28],ymm14[28],ymm7[29],ymm14[29],ymm7[30],ymm14[30],ymm7[31],ymm14[31] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm8, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm12 +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm15[8],mem[8],ymm15[9],mem[9],ymm15[10],mem[10],ymm15[11],mem[11],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15],ymm15[24],mem[24],ymm15[25],mem[25],ymm15[26],mem[26],ymm15[27],mem[27],ymm15[28],mem[28],ymm15[29],mem[29],ymm15[30],mem[30],ymm15[31],mem[31] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm15, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload @@ -3642,82 +3657,82 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm5 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw (%rsp), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm5 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[16],ymm13[16],ymm15[17],ymm13[17],ymm15[18],ymm13[18],ymm15[19],ymm13[19],ymm15[20],ymm13[20],ymm15[21],ymm13[21],ymm15[22],ymm13[22],ymm15[23],ymm13[23] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm7[0],ymm12[0],ymm7[1],ymm12[1],ymm7[2],ymm12[2],ymm7[3],ymm12[3],ymm7[4],ymm12[4],ymm7[5],ymm12[5],ymm7[6],ymm12[6],ymm7[7],ymm12[7],ymm7[16],ymm12[16],ymm7[17],ymm12[17],ymm7[18],ymm12[18],ymm7[19],ymm12[19],ymm7[20],ymm12[20],ymm7[21],ymm12[21],ymm7[22],ymm12[22],ymm7[23],ymm12[23] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm3 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[16],ymm9[16],ymm6[17],ymm9[17],ymm6[18],ymm9[18],ymm6[19],ymm9[19],ymm6[20],ymm9[20],ymm6[21],ymm9[21],ymm6[22],ymm9[22],ymm6[23],ymm9[23] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[4],mem[4],ymm12[5],mem[5],ymm12[6],mem[6],ymm12[7],mem[7],ymm12[16],mem[16],ymm12[17],mem[17],ymm12[18],mem[18],ymm12[19],mem[19],ymm12[20],mem[20],ymm12[21],mem[21],ymm12[22],mem[22],ymm12[23],mem[23] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm5 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3736,23 +3751,23 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $648, %rsp # imm = 0x288 +; AVX2-FAST-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride6_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $648, %rsp # imm = 0x288 +; AVX2-FAST-PERLANE-NEXT: subq $680, %rsp # imm = 0x2A8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 @@ -3761,204 +3776,206 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm4 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[16],ymm3[16],ymm5[17],ymm3[17],ymm5[18],ymm3[18],ymm5[19],ymm3[19],ymm5[20],ymm3[20],ymm5[21],ymm3[21],ymm5[22],ymm3[22],ymm5[23],ymm3[23] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[16],ymm6[16],ymm3[17],ymm6[17],ymm3[18],ymm6[18],ymm3[19],ymm6[19],ymm3[20],ymm6[20],ymm3[21],ymm6[21],ymm3[22],ymm6[22],ymm3[23],ymm6[23] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm3, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm12, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm13, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm7, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm14[8],xmm8[9],xmm14[9],xmm8[10],xmm14[10],xmm8[11],xmm14[11],xmm8[12],xmm14[12],xmm8[13],xmm14[13],xmm8[14],xmm14[14],xmm8[15],xmm14[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm12[8],ymm0[9],ymm12[9],ymm0[10],ymm12[10],ymm0[11],ymm12[11],ymm0[12],ymm12[12],ymm0[13],ymm12[13],ymm0[14],ymm12[14],ymm0[15],ymm12[15],ymm0[24],ymm12[24],ymm0[25],ymm12[25],ymm0[26],ymm12[26],ymm0[27],ymm12[27],ymm0[28],ymm12[28],ymm0[29],ymm12[29],ymm0[30],ymm12[30],ymm0[31],ymm12[31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm7[8],ymm14[8],ymm7[9],ymm14[9],ymm7[10],ymm14[10],ymm7[11],ymm14[11],ymm7[12],ymm14[12],ymm7[13],ymm14[13],ymm7[14],ymm14[14],ymm7[15],ymm14[15],ymm7[24],ymm14[24],ymm7[25],ymm14[25],ymm7[26],ymm14[26],ymm7[27],ymm14[27],ymm7[28],ymm14[28],ymm7[29],ymm14[29],ymm7[30],ymm14[30],ymm7[31],ymm14[31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm8, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm9, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[8],mem[8],ymm15[9],mem[9],ymm15[10],mem[10],ymm15[11],mem[11],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15],ymm15[24],mem[24],ymm15[25],mem[25],ymm15[26],mem[26],ymm15[27],mem[27],ymm15[28],mem[28],ymm15[29],mem[29],ymm15[30],mem[30],ymm15[31],mem[31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm10, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm15, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm10, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm14, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload @@ -3966,82 +3983,82 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw (%rsp), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[16],ymm13[16],ymm15[17],ymm13[17],ymm15[18],ymm13[18],ymm15[19],ymm13[19],ymm15[20],ymm13[20],ymm15[21],ymm13[21],ymm15[22],ymm13[22],ymm15[23],ymm13[23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm7[0],ymm12[0],ymm7[1],ymm12[1],ymm7[2],ymm12[2],ymm7[3],ymm12[3],ymm7[4],ymm12[4],ymm7[5],ymm12[5],ymm7[6],ymm12[6],ymm7[7],ymm12[7],ymm7[16],ymm12[16],ymm7[17],ymm12[17],ymm7[18],ymm12[18],ymm7[19],ymm12[19],ymm7[20],ymm12[20],ymm7[21],ymm12[21],ymm7[22],ymm12[22],ymm7[23],ymm12[23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[16],ymm9[16],ymm6[17],ymm9[17],ymm6[18],ymm9[18],ymm6[19],ymm9[19],ymm6[20],ymm9[20],ymm6[21],ymm9[21],ymm6[22],ymm9[22],ymm6[23],ymm9[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[4],mem[4],ymm12[5],mem[5],ymm12[6],mem[6],ymm12[7],mem[7],ymm12[16],mem[16],ymm12[17],mem[17],ymm12[18],mem[18],ymm12[19],mem[19],ymm12[20],mem[20],ymm12[21],mem[21],ymm12[22],mem[22],ymm12[23],mem[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm10, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm13, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm14, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4060,10 +4077,10 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $648, %rsp # imm = 0x288 +; AVX2-FAST-PERLANE-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -4071,572 +4088,571 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: subq $264, %rsp # imm = 0x108 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm1 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX512F-SLOW-NEXT: vmovdqa %xmm12, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm26 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm27 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm1 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm3, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm28 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm28 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm7, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm4 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa %xmm12, %xmm5 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa %xmm13, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm1 ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm13, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm1 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm10, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm11, %ymm1 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm4[8],ymm13[8],ymm4[9],ymm13[9],ymm4[10],ymm13[10],ymm4[11],ymm13[11],ymm4[12],ymm13[12],ymm4[13],ymm13[13],ymm4[14],ymm13[14],ymm4[15],ymm13[15],ymm4[24],ymm13[24],ymm4[25],ymm13[25],ymm4[26],ymm13[26],ymm4[27],ymm13[27],ymm4[28],ymm13[28],ymm4[29],ymm13[29],ymm4[30],ymm13[30],ymm4[31],ymm13[31] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm29 -; AVX512F-SLOW-NEXT: vmovdqa %ymm13, %ymm4 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15],ymm11[24],ymm10[24],ymm11[25],ymm10[25],ymm11[26],ymm10[26],ymm11[27],ymm10[27],ymm11[28],ymm10[28],ymm11[29],ymm10[29],ymm11[30],ymm10[30],ymm11[31],ymm10[31] +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm13 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm6, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm1 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm8, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm9, %ymm1 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15],ymm7[24],ymm6[24],ymm7[25],ymm6[25],ymm7[26],ymm6[26],ymm7[27],ymm6[27],ymm7[28],ymm6[28],ymm7[29],ymm6[29],ymm7[30],ymm6[30],ymm7[31],ymm6[31] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15],ymm9[24],ymm8[24],ymm9[25],ymm8[25],ymm9[26],ymm8[26],ymm9[27],ymm8[27],ymm9[28],ymm8[28],ymm9[29],ymm8[29],ymm9[30],ymm8[30],ymm9[31],ymm8[31] ; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX512F-SLOW-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm4 ; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] ; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm15 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm0 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm15 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm25 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm15, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm11 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[4],ymm0[4],ymm11[5],ymm0[5],ymm11[6],ymm0[6],ymm11[7],ymm0[7],ymm11[16],ymm0[16],ymm11[17],ymm0[17],ymm11[18],ymm0[18],ymm11[19],ymm0[19],ymm11[20],ymm0[20],ymm11[21],ymm0[21],ymm11[22],ymm0[22],ymm11[23],ymm0[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm1[8],ymm15[8],ymm1[9],ymm15[9],ymm1[10],ymm15[10],ymm1[11],ymm15[11],ymm1[12],ymm15[12],ymm1[13],ymm15[13],ymm1[14],ymm15[14],ymm1[15],ymm15[15],ymm1[24],ymm15[24],ymm1[25],ymm15[25],ymm1[26],ymm15[26],ymm1[27],ymm15[27],ymm1[28],ymm15[28],ymm1[29],ymm15[29],ymm1[30],ymm15[30],ymm1[31],ymm15[31] -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm11, %ymm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm15, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm7 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[2],ymm0[2],ymm7[3],ymm0[3],ymm7[4],ymm0[4],ymm7[5],ymm0[5],ymm7[6],ymm0[6],ymm7[7],ymm0[7],ymm7[16],ymm0[16],ymm7[17],ymm0[17],ymm7[18],ymm0[18],ymm7[19],ymm0[19],ymm7[20],ymm0[20],ymm7[21],ymm0[21],ymm7[22],ymm0[22],ymm7[23],ymm0[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15],ymm2[24],ymm15[24],ymm2[25],ymm15[25],ymm2[26],ymm15[26],ymm2[27],ymm15[27],ymm2[28],ymm15[28],ymm2[29],ymm15[29],ymm2[30],ymm15[30],ymm2[31],ymm15[31] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm24 ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm13 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[16],ymm12[16],ymm13[17],ymm12[17],ymm13[18],ymm12[18],ymm13[19],ymm12[19],ymm13[20],ymm12[20],ymm13[21],ymm12[21],ymm13[22],ymm12[22],ymm13[23],ymm12[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm13 = ymm11[8],ymm0[8],ymm11[9],ymm0[9],ymm11[10],ymm0[10],ymm11[11],ymm0[11],ymm11[12],ymm0[12],ymm11[13],ymm0[13],ymm11[14],ymm0[14],ymm11[15],ymm0[15],ymm11[24],ymm0[24],ymm11[25],ymm0[25],ymm11[26],ymm0[26],ymm11[27],ymm0[27],ymm11[28],ymm0[28],ymm11[29],ymm0[29],ymm11[30],ymm0[30],ymm11[31],ymm0[31] -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm13, %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm22 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[4],ymm0[4],ymm11[5],ymm0[5],ymm11[6],ymm0[6],ymm11[7],ymm0[7],ymm11[16],ymm0[16],ymm11[17],ymm0[17],ymm11[18],ymm0[18],ymm11[19],ymm0[19],ymm11[20],ymm0[20],ymm11[21],ymm0[21],ymm11[22],ymm0[22],ymm11[23],ymm0[23] +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm12 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[16],ymm7[16],ymm12[17],ymm7[17],ymm12[18],ymm7[18],ymm12[19],ymm7[19],ymm12[20],ymm7[20],ymm12[21],ymm7[21],ymm12[22],ymm7[22],ymm12[23],ymm7[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm13[8],ymm0[8],ymm13[9],ymm0[9],ymm13[10],ymm0[10],ymm13[11],ymm0[11],ymm13[12],ymm0[12],ymm13[13],ymm0[13],ymm13[14],ymm0[14],ymm13[15],ymm0[15],ymm13[24],ymm0[24],ymm13[25],ymm0[25],ymm13[26],ymm0[26],ymm13[27],ymm0[27],ymm13[28],ymm0[28],ymm13[29],ymm0[29],ymm13[30],ymm0[30],ymm13[31],ymm0[31] +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm12, %ymm12 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm23 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm22 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[2],ymm0[2],ymm13[3],ymm0[3],ymm13[4],ymm0[4],ymm13[5],ymm0[5],ymm13[6],ymm0[6],ymm13[7],ymm0[7],ymm13[16],ymm0[16],ymm13[17],ymm0[17],ymm13[18],ymm0[18],ymm13[19],ymm0[19],ymm13[20],ymm0[20],ymm13[21],ymm0[21],ymm13[22],ymm0[22],ymm13[23],ymm0[23] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm13 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[4],ymm15[4],ymm1[5],ymm15[5],ymm1[6],ymm15[6],ymm1[7],ymm15[7],ymm1[16],ymm15[16],ymm1[17],ymm15[17],ymm1[18],ymm15[18],ymm1[19],ymm15[19],ymm1[20],ymm15[20],ymm1[21],ymm15[21],ymm1[22],ymm15[22],ymm1[23],ymm15[23] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[16],ymm15[16],ymm2[17],ymm15[17],ymm2[18],ymm15[18],ymm2[19],ymm15[19],ymm2[20],ymm15[20],ymm2[21],ymm15[21],ymm2[22],ymm15[22],ymm2[23],ymm15[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm2, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm1 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm29 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm12, %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm29 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm30 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm8 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm28 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm9 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512F-SLOW-NEXT: vprold $16, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm26 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm26 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX512F-SLOW-NEXT: vprold $16, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm27 -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512F-SLOW-NEXT: vprold $16, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm27 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm3 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm15[0,0,0,1] -; AVX512F-SLOW-NEXT: vprold $16, %ymm19, %ymm15 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm6 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm10 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm14[0,0,0,1] +; AVX512F-SLOW-NEXT: vprold $16, %ymm19, %ymm1 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm10 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm10, %zmm8 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm8, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm3, %ymm15, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm8, %ymm3, %ymm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm9[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm30[0,0,0,1] -; AVX512F-SLOW-NEXT: vprold $16, %ymm31, %ymm9 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm21[0,0,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm12 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm20[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm17[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm11[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm16[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm4[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm12, %zmm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm8, %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm4, %ymm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm11, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm29[0,0,0,1] +; AVX512F-SLOW-NEXT: vprold $16, %ymm30, %ymm9 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm31[0,0,0,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm11 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm28[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm21[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm18[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm13[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm16[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm17[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm10, %zmm9 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm12, %zmm9 ; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm9, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm15, %ymm5 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm9, %ymm3, %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm13[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm5 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm8, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm8, %ymm6 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm9, %ymm4, %ymm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm14[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm15, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm6 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm26[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm8 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm5 = mem[0,0,0,1,4,4,4,5] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm9, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm9, %zmm5 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm27[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm12 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm9, %zmm12 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm8, %ymm1 +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm11 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm9, %zmm11 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm9, %ymm30 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm8, %ymm15, %ymm31 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm9, %ymm29 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm8, %ymm30 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm5 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm14, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm11, %ymm8, %ymm31 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm9, %ymm13 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm1 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm31[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vpermq $64, (%rsp), %zmm8 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm8 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm13, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm12, %ymm15, %ymm11 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm12, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm9, %ymm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm11[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq $64, (%rsp), %zmm11 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm11 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm13, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm14, %zmm8 ; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm1 = mem[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm12 = mem[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm10, %zmm12 +; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm11 = mem[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm12, %zmm11 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm24[2,2,2,3,6,6,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm23[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm10, %zmm13 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm12, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm3, %ymm6 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm12, %zmm13 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm4, %ymm10 ; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm13, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm3, %ymm4 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm12, %ymm9, %ymm0 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm13, %ymm9, %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm25[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm4, %ymm2 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm11, %ymm9, %ymm7 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm13, %ymm9, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm25[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm7, %zmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm22[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm7, %zmm2 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-SLOW-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i8_stride6_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $200, %rsp -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX512F-FAST-NEXT: subq $264, %rsp # imm = 0x108 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15],ymm8[24],ymm7[24],ymm8[25],ymm7[25],ymm8[26],ymm7[26],ymm8[27],ymm7[27],ymm8[28],ymm7[28],ymm8[29],ymm7[29],ymm8[30],ymm7[30],ymm8[31],ymm7[31] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm26 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm27 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15],ymm6[24],ymm2[24],ymm6[25],ymm2[25],ymm6[26],ymm2[26],ymm6[27],ymm2[27],ymm6[28],ymm2[28],ymm6[29],ymm2[29],ymm6[30],ymm2[30],ymm6[31],ymm2[31] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm28 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm9 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15],ymm8[24],ymm7[24],ymm8[25],ymm7[25],ymm8[26],ymm7[26],ymm8[27],ymm7[27],ymm8[28],ymm7[28],ymm8[29],ymm7[29],ymm8[30],ymm7[30],ymm8[31],ymm7[31] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm26 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm2 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm2 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15],ymm5[24],ymm3[24],ymm5[25],ymm3[25],ymm5[26],ymm3[26],ymm5[27],ymm3[27],ymm5[28],ymm3[28],ymm5[29],ymm3[29],ymm5[30],ymm3[30],ymm5[31],ymm3[31] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm28 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm30 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm15 -; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[16],ymm2[16],ymm5[17],ymm2[17],ymm5[18],ymm2[18],ymm5[19],ymm2[19],ymm5[20],ymm2[20],ymm5[21],ymm2[21],ymm5[22],ymm2[22],ymm5[23],ymm2[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11],ymm15[12],ymm3[12],ymm15[13],ymm3[13],ymm15[14],ymm3[14],ymm15[15],ymm3[15],ymm15[24],ymm3[24],ymm15[25],ymm3[25],ymm15[26],ymm3[26],ymm15[27],ymm3[27],ymm15[28],ymm3[28],ymm15[29],ymm3[29],ymm15[30],ymm3[30],ymm15[31],ymm3[31] -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm3 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm29 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa %ymm6, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm9 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[4],ymm2[4],ymm9[5],ymm2[5],ymm9[6],ymm2[6],ymm9[7],ymm2[7],ymm9[16],ymm2[16],ymm9[17],ymm2[17],ymm9[18],ymm2[18],ymm9[19],ymm2[19],ymm9[20],ymm2[20],ymm9[21],ymm2[21],ymm9[22],ymm2[22],ymm9[23],ymm2[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm10[8],ymm4[8],ymm10[9],ymm4[9],ymm10[10],ymm4[10],ymm10[11],ymm4[11],ymm10[12],ymm4[12],ymm10[13],ymm4[13],ymm10[14],ymm4[14],ymm10[15],ymm4[15],ymm10[24],ymm4[24],ymm10[25],ymm4[25],ymm10[26],ymm4[26],ymm10[27],ymm4[27],ymm10[28],ymm4[28],ymm10[29],ymm4[29],ymm10[30],ymm4[30],ymm10[31],ymm4[31] -; AVX512F-FAST-NEXT: vmovdqa %ymm10, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm11 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm10 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm6 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[16],ymm0[16],ymm6[17],ymm0[17],ymm6[18],ymm0[18],ymm6[19],ymm0[19],ymm6[20],ymm0[20],ymm6[21],ymm0[21],ymm6[22],ymm0[22],ymm6[23],ymm0[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15],ymm13[24],ymm11[24],ymm13[25],ymm11[25],ymm13[26],ymm11[26],ymm13[27],ymm11[27],ymm13[28],ymm11[28],ymm13[29],ymm11[29],ymm13[30],ymm11[30],ymm13[31],ymm11[31] +; AVX512F-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm1 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm11 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm1 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm23 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm12 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm0[8],xmm12[9],xmm0[9],xmm12[10],xmm0[10],xmm12[11],xmm0[11],xmm12[12],xmm0[12],xmm12[13],xmm0[13],xmm12[14],xmm0[14],xmm12[15],xmm0[15] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm12, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm22 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm15[0],ymm6[0],ymm15[1],ymm6[1],ymm15[2],ymm6[2],ymm15[3],ymm6[3],ymm15[4],ymm6[4],ymm15[5],ymm6[5],ymm15[6],ymm6[6],ymm15[7],ymm6[7],ymm15[16],ymm6[16],ymm15[17],ymm6[17],ymm15[18],ymm6[18],ymm15[19],ymm6[19],ymm15[20],ymm6[20],ymm15[21],ymm6[21],ymm15[22],ymm6[22],ymm15[23],ymm6[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm9 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm6 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[16],ymm6[16],ymm0[17],ymm6[17],ymm0[18],ymm6[18],ymm0[19],ymm6[19],ymm0[20],ymm6[20],ymm0[21],ymm6[21],ymm0[22],ymm6[22],ymm0[23],ymm6[23] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm11 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm15, %ymm17 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512F-FAST-NEXT: vprold $16, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm26 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm14, %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] -; AVX512F-FAST-NEXT: vprold $16, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm27 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm11 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm11, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm15, %xmm11 +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm12 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm1, %zmm23 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm2, %zmm22 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm13[0],ymm3[0],ymm13[1],ymm3[1],ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[4],ymm3[4],ymm13[5],ymm3[5],ymm13[6],ymm3[6],ymm13[7],ymm3[7],ymm13[16],ymm3[16],ymm13[17],ymm3[17],ymm13[18],ymm3[18],ymm13[19],ymm3[19],ymm13[20],ymm3[20],ymm13[21],ymm3[21],ymm13[22],ymm3[22],ymm13[23],ymm3[23] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[4],ymm9[4],ymm5[5],ymm9[5],ymm5[6],ymm9[6],ymm5[7],ymm9[7],ymm5[16],ymm9[16],ymm5[17],ymm9[17],ymm5[18],ymm9[18],ymm5[19],ymm9[19],ymm5[20],ymm9[20],ymm5[21],ymm9[21],ymm5[22],ymm9[22],ymm5[23],ymm9[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm13 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm9 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm9[0],ymm3[1],ymm9[1],ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[4],ymm9[4],ymm3[5],ymm9[5],ymm3[6],ymm9[6],ymm3[7],ymm9[7],ymm3[16],ymm9[16],ymm3[17],ymm9[17],ymm3[18],ymm9[18],ymm3[19],ymm9[19],ymm3[20],ymm9[20],ymm3[21],ymm9[21],ymm3[22],ymm9[22],ymm3[23],ymm9[23] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm28 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm12 +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[16],ymm3[16],ymm8[17],ymm3[17],ymm8[18],ymm3[18],ymm8[19],ymm3[19],ymm8[20],ymm3[20],ymm8[21],ymm3[21],ymm8[22],ymm3[22],ymm8[23],ymm3[23] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm20 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm11 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm8 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm29 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm3 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm10 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm15 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm18 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm16 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm14 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm10 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-FAST-NEXT: vprold $16, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm26 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX512F-FAST-NEXT: vprold $16, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm27 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,0,0,1] -; AVX512F-FAST-NEXT: vprold $16, %ymm20, %ymm9 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm10 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm13[0,0,0,1] +; AVX512F-FAST-NEXT: vprold $16, %ymm19, %ymm2 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm13, %zmm8 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,2,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm12, %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm9, %zmm8 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm2, %ymm13, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm8, %ymm2, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm9, %zmm2 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm2, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm0, %ymm1, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm2, %ymm0, %ymm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm2 +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm5, %zmm8, %zmm7 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm31[0,0,0,1] -; AVX512F-FAST-NEXT: vprold $16, %ymm28, %ymm10 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm29[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm18[2,2,2,3] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm10, %zmm8, %zmm2 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm30[0,0,0,1] +; AVX512F-FAST-NEXT: vprold $16, %ymm28, %ymm11 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm31[0,0,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm20[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm15[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm21[0,0,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm29[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm18[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm16[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm17[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm19[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm30[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm17[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm16[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm9, %zmm10 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm10, %ymm5 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm13, %ymm11 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm10, %ymm2, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm5 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm8, %zmm5 -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm4 = mem[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm10, %zmm9, %zmm11 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm11, %ymm10 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm10, %ymm1, %ymm13 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm11, %ymm0, %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm10 +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm15, %zmm10 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm5, %zmm8, %zmm10 +; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm5 = mem[2,2,2,3,6,6,6,7] ; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm8 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm9, %zmm8 -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm4 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm10 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm9, %zmm10 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm4, %ymm2, %ymm30 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm10, %ymm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm4, %ymm2, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm8, %ymm2, %ymm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm4 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm15[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-FAST-NEXT: vpermq $234, (%rsp), %zmm8 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm9, %zmm8 +; AVX512F-FAST-NEXT: vpermq $234, (%rsp), %zmm5 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm5 = mem[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm11 = mem[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm9, %zmm11 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm5 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm0, %ymm29 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm11, %ymm5 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm0, %ymm31 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm8, %ymm0, %ymm30 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm5 +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm30[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm8 = mem[2,2,2,3,6,6,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm9, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm10, %ymm2, %ymm31 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm4 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm31[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm24[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm9, %zmm6 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm26[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm25[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm10, %zmm9 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm27[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm21[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm10, %zmm11 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm9, %ymm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm4, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm11, %ymm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm4, %ymm2, %ymm14 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm9, %ymm13, %ymm3 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm11, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm5, %zmm9, %zmm8 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm11, %ymm0, %ymm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm5 +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm14[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm11 = mem[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm5, %zmm9, %zmm11 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm26[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm24[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm12, %zmm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm27[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm13 = zmm23[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm12, %zmm13 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm9, %ymm5 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm0, %ymm7 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm13, %ymm5 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm0, %ymm4 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm9, %ymm1, %ymm6 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm13, %ymm1, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm6[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm25[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm23[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm22[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm22[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm3 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512F-FAST-NEXT: addq $200, %rsp +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-FAST-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -4853,43 +4869,43 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512BW-FAST-LABEL: store_i8_stride6_vf64: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm8 ; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm3 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] +; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm1 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm3, %ymm8, %ymm3 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-FAST-NEXT: vpermw %ymm1, %ymm9, %ymm1 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 ; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm2 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15],ymm6[24],ymm5[24],ymm6[25],ymm5[25],ymm6[26],ymm5[26],ymm6[27],ymm5[27],ymm6[28],ymm5[28],ymm6[29],ymm5[29],ymm6[30],ymm5[30],ymm6[31],ymm5[31] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm14, %ymm4 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15],ymm7[24],ymm5[24],ymm7[25],ymm5[25],ymm7[26],ymm5[26],ymm7[27],ymm5[27],ymm7[28],ymm5[28],ymm7[29],ymm5[29],ymm7[30],ymm5[30],ymm7[31],ymm5[31] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm13, %ymm2 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: movl $613566756, %eax # imm = 0x24924924 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm3[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm8[4,5,6,7] ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm4, %zmm4 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,2,2,3,6,6,6,7] +; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] ; AVX512BW-FAST-NEXT: movl $-1840700270, %eax # imm = 0x92492492 ; AVX512BW-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm4, %zmm0 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm4[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm1[0,1,2,3],zmm10[4,5,6,7] ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = ; AVX512BW-FAST-NEXT: vpshufb %zmm16, %zmm11, %zmm11 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,2,2,3,6,6,6,7] @@ -4897,160 +4913,160 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm11, %zmm0 {%k3} ; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm11 -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm17 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm13 -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm7 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm17[0],ymm7[1],ymm17[1],ymm7[2],ymm17[2],ymm7[3],ymm17[3],ymm7[4],ymm17[4],ymm7[5],ymm17[5],ymm7[6],ymm17[6],ymm7[7],ymm17[7],ymm7[16],ymm17[16],ymm7[17],ymm17[17],ymm7[18],ymm17[18],ymm7[19],ymm17[19],ymm7[20],ymm17[20],ymm7[21],ymm17[21],ymm7[22],ymm17[22],ymm7[23],ymm17[23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm17 = ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15],ymm13[24],ymm11[24],ymm13[25],ymm11[25],ymm13[26],ymm11[26],ymm13[27],ymm11[27],ymm13[28],ymm11[28],ymm13[29],ymm11[29],ymm13[30],ymm11[30],ymm13[31],ymm11[31] -; AVX512BW-FAST-NEXT: vpermw %ymm17, %ymm8, %ymm8 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm17, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm18, %ymm12 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[16],ymm7[16],ymm12[17],ymm7[17],ymm12[18],ymm7[18],ymm12[19],ymm7[19],ymm12[20],ymm7[20],ymm12[21],ymm7[21],ymm12[22],ymm7[22],ymm12[23],ymm7[23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm18[8],ymm17[8],ymm18[9],ymm17[9],ymm18[10],ymm17[10],ymm18[11],ymm17[11],ymm18[12],ymm17[12],ymm18[13],ymm17[13],ymm18[14],ymm17[14],ymm18[15],ymm17[15],ymm18[24],ymm17[24],ymm18[25],ymm17[25],ymm18[26],ymm17[26],ymm18[27],ymm17[27],ymm18[28],ymm17[28],ymm18[29],ymm17[29],ymm18[30],ymm17[30],ymm18[31],ymm17[31] -; AVX512BW-FAST-NEXT: vpermw %ymm12, %ymm14, %ymm12 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm7 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm8, %zmm7 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm9 -; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm9, %zmm9 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,2,2,3,6,6,6,7] -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm9, %zmm7 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm10 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm17 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm6 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm17[0],ymm6[1],ymm17[1],ymm6[2],ymm17[2],ymm6[3],ymm17[3],ymm6[4],ymm17[4],ymm6[5],ymm17[5],ymm6[6],ymm17[6],ymm6[7],ymm17[7],ymm6[16],ymm17[16],ymm6[17],ymm17[17],ymm6[18],ymm17[18],ymm6[19],ymm17[19],ymm6[20],ymm17[20],ymm6[21],ymm17[21],ymm6[22],ymm17[22],ymm6[23],ymm17[23] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm17 = ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11],ymm14[12],ymm11[12],ymm14[13],ymm11[13],ymm14[14],ymm11[14],ymm14[15],ymm11[15],ymm14[24],ymm11[24],ymm14[25],ymm11[25],ymm14[26],ymm11[26],ymm14[27],ymm11[27],ymm14[28],ymm11[28],ymm14[29],ymm11[29],ymm14[30],ymm11[30],ymm14[31],ymm11[31] +; AVX512BW-FAST-NEXT: vpermw %ymm17, %ymm9, %ymm9 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %ymm18 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm18, %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %ymm19 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm19, %ymm12 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm12[0],ymm6[0],ymm12[1],ymm6[1],ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[4],ymm6[4],ymm12[5],ymm6[5],ymm12[6],ymm6[6],ymm12[7],ymm6[7],ymm12[16],ymm6[16],ymm12[17],ymm6[17],ymm12[18],ymm6[18],ymm12[19],ymm6[19],ymm12[20],ymm6[20],ymm12[21],ymm6[21],ymm12[22],ymm6[22],ymm12[23],ymm6[23] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31] +; AVX512BW-FAST-NEXT: vpermw %ymm12, %ymm13, %ymm12 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm9, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa (%r8), %ymm9 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm8, %zmm6 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm8 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm10 ; AVX512BW-FAST-NEXT: vpshufb %zmm16, %zmm10, %zmm10 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,2,2,3,6,6,6,7] -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm10, %zmm7 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm21 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm10, %zmm6 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm22 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm23 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm12, %xmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %xmm22 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm14, %xmm15 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm21 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm12, %xmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %xmm23 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %xmm15 +; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm15, %xmm13 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3],xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6] -; AVX512BW-FAST-NEXT: vpermw %ymm15, %ymm24, %ymm15 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %xmm15 +; AVX512BW-FAST-NEXT: vpermw %ymm13, %ymm24, %ymm13 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %xmm17 ; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm25 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm15, %xmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm19 -; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm19, %xmm20 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm20[8],xmm10[8],xmm20[9],xmm10[9],xmm20[10],xmm10[10],xmm20[11],xmm10[11],xmm20[12],xmm10[12],xmm20[13],xmm10[13],xmm20[14],xmm10[14],xmm20[15],xmm10[15] +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm17, %xmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm20 +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm20, %xmm16 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm16[8],xmm10[8],xmm16[9],xmm10[9],xmm16[10],xmm10[10],xmm16[11],xmm10[11],xmm16[12],xmm10[12],xmm16[13],xmm10[13],xmm16[14],xmm10[14],xmm16[15],xmm10[15] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm19[0],xmm15[0],xmm19[1],xmm15[1],xmm19[2],xmm15[2],xmm19[3],xmm15[3],xmm19[4],xmm15[4],xmm19[5],xmm15[5],xmm19[6],xmm15[6],xmm19[7],xmm15[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm20[0],xmm17[0],xmm20[1],xmm17[1],xmm20[2],xmm17[2],xmm20[3],xmm17[3],xmm20[4],xmm17[4],xmm20[5],xmm17[5],xmm20[6],xmm17[6],xmm20[7],xmm17[7] ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] -; AVX512BW-FAST-NEXT: vpermw %ymm20, %ymm26, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm20, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm16, %zmm10 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %xmm16 +; AVX512BW-FAST-NEXT: vpermw %ymm16, %ymm26, %ymm16 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm16, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm13, %zmm10 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %xmm13 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm27 = <8,u,9,u,u,u,u,u,u,u,5,u,6,u,7,u> -; AVX512BW-FAST-NEXT: vpshufb %xmm27, %xmm16, %xmm20 -; AVX512BW-FAST-NEXT: vpmovzxbw {{.*#+}} xmm28 = xmm16[0],zero,xmm16[1],zero,xmm16[2],zero,xmm16[3],zero,xmm16[4],zero,xmm16[5],zero,xmm16[6],zero,xmm16[7],zero -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm28, %zmm20 +; AVX512BW-FAST-NEXT: vpshufb %xmm27, %xmm13, %xmm16 +; AVX512BW-FAST-NEXT: vpmovzxbw {{.*#+}} xmm28 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm28, %zmm16 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] -; AVX512BW-FAST-NEXT: vpermw %zmm20, %zmm28, %zmm10 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm20 +; AVX512BW-FAST-NEXT: vpermw %zmm16, %zmm28, %zmm10 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm16 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm29 = -; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm20, %xmm30 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm31 = xmm20[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm16, %xmm30 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm31 = xmm16[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm30, %zmm31, %zmm30 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm31 ; AVX512BW-FAST-NEXT: vpermw %zmm30, %zmm28, %zmm30 ; AVX512BW-FAST-NEXT: movabsq $585610922974906400, %rax # imm = 0x820820820820820 ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm30, %zmm10 {%k3} -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm21, %xmm30 -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm22, %xmm23 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm23[0],xmm30[0],xmm23[1],xmm30[1],xmm23[2],xmm30[2],xmm23[3],xmm30[3],xmm23[4],xmm30[4],xmm23[5],xmm30[5],xmm23[6],xmm30[6],xmm23[7],xmm30[7] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm30 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm22, %xmm30 +; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm23, %xmm21 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm21[0],xmm30[0],xmm21[1],xmm30[1],xmm21[2],xmm30[2],xmm21[3],xmm30[3],xmm21[4],xmm30[4],xmm21[5],xmm30[5],xmm21[6],xmm30[6],xmm21[7],xmm30[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm30 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] ; AVX512BW-FAST-NEXT: vpermw %ymm30, %ymm24, %ymm24 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm30 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm24, %zmm24 -; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm31, %xmm23 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm24, %zmm24 +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm31, %xmm21 ; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm30, %xmm25 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm25[8],xmm23[8],xmm25[9],xmm23[9],xmm25[10],xmm23[10],xmm25[11],xmm23[11],xmm25[12],xmm23[12],xmm25[13],xmm23[13],xmm25[14],xmm23[14],xmm25[15],xmm23[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm25[8],xmm21[8],xmm25[9],xmm21[9],xmm25[10],xmm21[10],xmm25[11],xmm21[11],xmm25[12],xmm21[12],xmm25[13],xmm21[13],xmm25[14],xmm21[14],xmm25[15],xmm21[15] ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm30[0],xmm31[0],xmm30[1],xmm31[1],xmm30[2],xmm31[2],xmm30[3],xmm31[3],xmm30[4],xmm31[4],xmm30[5],xmm31[5],xmm30[6],xmm31[6],xmm30[7],xmm31[7] ; AVX512BW-FAST-NEXT: vpermw %ymm25, %ymm26, %ymm25 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm25, %zmm23 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm25, %zmm21 ; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %xmm25 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm23 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm21 {%k2} ; AVX512BW-FAST-NEXT: vpshufb %xmm27, %xmm25, %xmm24 ; AVX512BW-FAST-NEXT: vpmovzxbw {{.*#+}} xmm26 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero,xmm25[4],zero,xmm25[5],zero,xmm25[6],zero,xmm25[7],zero -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm26, %zmm24 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %xmm26 -; AVX512BW-FAST-NEXT: vpermw %zmm24, %zmm28, %zmm23 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm26, %xmm24 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm26[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm24, %zmm27, %zmm24 -; AVX512BW-FAST-NEXT: vpermw %zmm24, %zmm28, %zmm24 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm24, %zmm23 {%k3} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm17 = ymm18[0],ymm17[0],ymm18[1],ymm17[1],ymm18[2],ymm17[2],ymm18[3],ymm17[3],ymm18[4],ymm17[4],ymm18[5],ymm17[5],ymm18[6],ymm17[6],ymm18[7],ymm17[7],ymm18[16],ymm17[16],ymm18[17],ymm17[17],ymm18[18],ymm17[18],ymm18[19],ymm17[19],ymm18[20],ymm17[20],ymm18[21],ymm17[21],ymm18[22],ymm17[22],ymm18[23],ymm17[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[16],ymm11[16],ymm13[17],ymm11[17],ymm13[18],ymm11[18],ymm13[19],ymm11[19],ymm13[20],ymm11[20],ymm13[21],ymm11[21],ymm13[22],ymm11[22],ymm13[23],ymm11[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm30[8],xmm31[8],xmm30[9],xmm31[9],xmm30[10],xmm31[10],xmm30[11],xmm31[11],xmm30[12],xmm31[12],xmm30[13],xmm31[13],xmm30[14],xmm31[14],xmm30[15],xmm31[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] -; AVX512BW-FAST-NEXT: vpermw %zmm11, %zmm13, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30] -; AVX512BW-FAST-NEXT: vpermw %zmm17, %zmm18, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm19[8],xmm15[8],xmm19[9],xmm15[9],xmm19[10],xmm15[10],xmm19[11],xmm15[11],xmm19[12],xmm15[12],xmm19[13],xmm15[13],xmm19[14],xmm15[14],xmm19[15],xmm15[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm13, %zmm1 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm25[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FAST-NEXT: vpermw %zmm5, %zmm18, %zmm1 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm26, %zmm26 +; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %xmm24 +; AVX512BW-FAST-NEXT: vpermw %zmm26, %zmm28, %zmm21 {%k1} +; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm24, %xmm26 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm24[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm26, %zmm27, %zmm26 +; AVX512BW-FAST-NEXT: vpermw %zmm26, %zmm28, %zmm26 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm26, %zmm21 {%k3} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm18 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm19, %zmm18 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[4],ymm11[4],ymm14[5],ymm11[5],ymm14[6],ymm11[6],ymm14[7],ymm11[7],ymm14[16],ymm11[16],ymm14[17],ymm11[17],ymm14[18],ymm11[18],ymm14[19],ymm11[19],ymm14[20],ymm11[20],ymm14[21],ymm11[21],ymm14[22],ymm11[22],ymm14[23],ymm11[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm30[8],xmm31[8],xmm30[9],xmm31[9],xmm30[10],xmm31[10],xmm30[11],xmm31[11],xmm30[12],xmm31[12],xmm30[13],xmm31[13],xmm30[14],xmm31[14],xmm30[15],xmm31[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] +; AVX512BW-FAST-NEXT: vpermw %zmm11, %zmm14, %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30] +; AVX512BW-FAST-NEXT: vpermw %zmm18, %zmm19, %zmm11 {%k1} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[16],ymm5[16],ymm7[17],ymm5[17],ymm7[18],ymm5[18],ymm7[19],ymm5[19],ymm7[20],ymm5[20],ymm7[21],ymm5[21],ymm7[22],ymm5[22],ymm7[23],ymm5[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm20[8],xmm17[8],xmm20[9],xmm17[9],xmm20[10],xmm17[10],xmm20[11],xmm17[11],xmm20[12],xmm17[12],xmm20[13],xmm17[13],xmm20[14],xmm17[14],xmm20[15],xmm17[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512BW-FAST-NEXT: vpermw %zmm3, %zmm14, %zmm3 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm25[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FAST-NEXT: vpermw %zmm5, %zmm19, %zmm3 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] -; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm5, %ymm2 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512BW-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm8 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 +; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm5, %ymm4 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512BW-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 ; AVX512BW-FAST-NEXT: movl $1227133513, %eax # imm = 0x49249249 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm2, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm26[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm5, %ymm2 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512BW-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm4, %zmm11 {%k1} +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm24[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm5, %ymm4 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512BW-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm8 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 ; AVX512BW-FAST-NEXT: movabsq $2342443691899625602, %rax # imm = 0x2082082082082082 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm11 {%k2} -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm2 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm3, %ymm5, %ymm3 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm4, %zmm11 {%k2} +; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm5, %ymm4 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm20[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm5, %ymm2 -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm3 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm1 {%k2} +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm3 {%k2} ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 256(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, (%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 61997b1350497..0909c3f43ea37 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -288,100 +288,100 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-LABEL: store_i8_stride7_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm10 = mem[0],zero ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm7 = mem[0],zero -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movq {{.*#+}} xmm6 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm14 = mem[0],zero +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: pand %xmm11, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: por %xmm6, %xmm11 +; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm10[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,0] -; SSE-NEXT: movdqa %xmm8, %xmm13 -; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: por %xmm6, %xmm13 -; SSE-NEXT: pand %xmm9, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: por %xmm13, %xmm9 -; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm10[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,1,0] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: pand %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: pxor %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm5, %xmm12 ; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm6[8],xmm15[9],xmm6[9],xmm15[10],xmm6[10],xmm15[11],xmm6[11],xmm15[12],xmm6[12],xmm15[13],xmm6[13],xmm15[14],xmm6[14],xmm15[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] ; SSE-NEXT: movdqa %xmm12, %xmm13 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: packuswb %xmm6, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,0,2,3] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm6, %xmm11 +; SSE-NEXT: packuswb %xmm0, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,3] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm0, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] ; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: pandn %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm14, %xmm6 ; SSE-NEXT: por %xmm9, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pandn %xmm9, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,0,2,1] -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: por %xmm11, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm14, %xmm11 -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm9, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: movdqa %xmm0, %xmm14 ; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm2, %xmm14 -; SSE-NEXT: pand %xmm9, %xmm14 -; SSE-NEXT: por %xmm11, %xmm14 +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: por %xmm9, %xmm14 ; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm12[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; SSE-NEXT: packuswb %xmm2, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm12[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; SSE-NEXT: packuswb %xmm2, %xmm9 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm12, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE-NEXT: pandn %xmm2, %xmm12 -; SSE-NEXT: por %xmm11, %xmm12 +; SSE-NEXT: por %xmm9, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] ; SSE-NEXT: pand %xmm2, %xmm12 ; SSE-NEXT: pandn %xmm14, %xmm2 @@ -390,17 +390,17 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: packuswb %xmm13, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm15[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,0,0] -; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm15[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE-NEXT: pandn %xmm9, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,2,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm4[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,1] +; SSE-NEXT: pand %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: por %xmm9, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] @@ -409,41 +409,40 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,4] ; SSE-NEXT: pandn %xmm7, %xmm8 ; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq %xmm0, 48(%rax) -; SSE-NEXT: movdqa %xmm9, 16(%rax) +; SSE-NEXT: movq %xmm2, 48(%rax) +; SSE-NEXT: movdqa %xmm1, 16(%rax) ; SSE-NEXT: movdqa %xmm12, 32(%rax) ; SSE-NEXT: movdqa %xmm6, (%rax) ; SSE-NEXT: retq @@ -812,23 +811,24 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $72, %rsp +; SSE-NEXT: subq $56, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa (%rsi), %xmm5 +; SSE-NEXT: movdqa (%rsi), %xmm4 ; SSE-NEXT: movdqa (%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rcx), %xmm7 -; SSE-NEXT: movdqa (%r8), %xmm4 -; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm9 +; SSE-NEXT: movdqa (%rcx), %xmm5 +; SSE-NEXT: movdqa (%r8), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r9), %xmm8 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -839,139 +839,141 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa %xmm9, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] +; SSE-NEXT: pandn %xmm4, %xmm10 ; SSE-NEXT: por %xmm3, %xmm10 ; SSE-NEXT: pand %xmm0, %xmm10 ; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm12, %xmm10 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: movdqa (%rax), %xmm3 -; SSE-NEXT: por %xmm10, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,7,7,7] -; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa (%rax), %xmm7 +; SSE-NEXT: por %xmm10, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,7,7,7] +; SSE-NEXT: movdqa %xmm7, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: por %xmm12, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,1,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,1,2,3] +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm12, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm1, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm1 -; SSE-NEXT: por %xmm14, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,0] -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm14 -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm5, %xmm12 -; SSE-NEXT: por %xmm12, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm12, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: pandn %xmm4, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm4, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm4, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm15[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: por %xmm14, %xmm12 +; SSE-NEXT: pandn %xmm7, %xmm4 +; SSE-NEXT: por %xmm14, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm12, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 ; SSE-NEXT: por %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,6,6,6] -; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,6,6,6] +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] ; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,6,6] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm1, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -979,168 +981,166 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: por %xmm0, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: por %xmm12, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[0,0,0,0,4,5,6,7] -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm7 -; SSE-NEXT: por %xmm7, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: pandn %xmm7, %xmm9 -; SSE-NEXT: pand %xmm5, %xmm12 -; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm7, %xmm9 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] ; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm9, %xmm14 ; SSE-NEXT: por %xmm0, %xmm14 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3] -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm9, %xmm12 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: por %xmm12, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm6, %xmm9 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,1,3,2] +; SSE-NEXT: pand %xmm10, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pshuflw $233, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,7] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: por %xmm3, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pandn %xmm3, %xmm13 +; SSE-NEXT: por %xmm6, %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: pandn %xmm13, %xmm3 -; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: por %xmm9, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm15[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,1,3] +; SSE-NEXT: pand %xmm12, %xmm9 +; SSE-NEXT: por %xmm9, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,2] ; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[2,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] +; SSE-NEXT: pand %xmm11, %xmm7 +; SSE-NEXT: por %xmm7, %xmm9 ; SSE-NEXT: pand %xmm10, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] +; SSE-NEXT: pandn %xmm7, %xmm10 ; SSE-NEXT: por %xmm9, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm7, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2] -; SSE-NEXT: pshuflw $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,1] -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm9, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm15[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm7, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd $101, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,4] -; SSE-NEXT: pandn %xmm9, %xmm11 +; SSE-NEXT: pshufd $101, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,4] +; SSE-NEXT: pandn %xmm7, %xmm11 ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: pand %xmm6, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 ; SSE-NEXT: por %xmm11, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: pand %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm0, 16(%rax) @@ -1153,7 +1153,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: addq $72, %rsp +; SSE-NEXT: addq $56, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride7_vf16: @@ -1816,694 +1816,693 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $344, %rsp # imm = 0x158 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm8 -; SSE-NEXT: movdqa 16(%rcx), %xmm12 -; SSE-NEXT: movdqa 16(%r8), %xmm7 -; SSE-NEXT: movdqa 16(%r9), %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; SSE-NEXT: subq $360, %rsp # imm = 0x168 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rdx), %xmm3 +; SSE-NEXT: movdqa 16(%rcx), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r8), %xmm6 +; SSE-NEXT: movdqa 16(%r9), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,6,6,6] +; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,5,5,7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,6,6,6] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm12[8],xmm4[9],xmm12[9],xmm4[10],xmm12[10],xmm4[11],xmm12[11],xmm4[12],xmm12[12],xmm4[13],xmm12[13],xmm4[14],xmm12[14],xmm4[15],xmm12[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,1,2,3] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa 16(%rax), %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa 16(%rax), %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,7,7,7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3] -; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,2,3] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm15, %xmm7 ; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm1, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm7, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,1,2,3] +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] -; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm9, %xmm7 ; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: por %xmm7, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm9, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm8, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,0,3] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: movdqa (%rdi), %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,1,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,1,0,3] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movdqa (%rcx), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,1,2,3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa (%rcx), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,1,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: movdqa (%r9), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] -; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm7 ; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: movdqa (%r8), %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: movdqa (%rax), %xmm8 +; SSE-NEXT: movdqa (%rdx), %xmm8 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa (%r9), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,1,2,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa (%r8), %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: movdqa (%rax), %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,6,6,6] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,1,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,6,5,7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,6,5,7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[2,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: por %xmm4, %xmm14 -; SSE-NEXT: pand %xmm9, %xmm14 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm14 ; SSE-NEXT: por %xmm0, %xmm14 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd $101, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,5,5,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm8, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,2,2] ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,1] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,2,1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] +; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,3] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: pand %xmm12, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,0] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,1,1,0] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm10, %xmm3 +; SSE-NEXT: pshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: por %xmm10, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm4, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[0,0,0,0,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: pandn %xmm3, %xmm15 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: por %xmm6, %xmm15 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm10 +; SSE-NEXT: por %xmm10, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,2] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: # xmm3 = mem[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pand %xmm13, %xmm3 ; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm14 +; SSE-NEXT: por %xmm6, %xmm14 ; SSE-NEXT: pshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshuflw $216, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,0] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa (%rsp), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: pshufhw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: por %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,7,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,6,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: pandn %xmm3, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: pand %xmm1, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm15 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: pshufhw $216, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm11, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm2, 32(%rax) +; SSE-NEXT: movdqa %xmm4, 32(%rax) ; SSE-NEXT: movdqa %xmm0, 96(%rax) -; SSE-NEXT: movdqa %xmm8, 112(%rax) +; SSE-NEXT: movdqa %xmm7, 112(%rax) ; SSE-NEXT: movdqa %xmm14, 176(%rax) -; SSE-NEXT: movdqa %xmm15, (%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2522,268 +2521,272 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps %xmm0, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rax) -; SSE-NEXT: addq $344, %rsp # imm = 0x158 +; SSE-NEXT: addq $360, %rsp # imm = 0x168 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $168, %rsp +; AVX1-ONLY-NEXT: subq $216, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u],zero,zero,xmm12[9,u,u,u,u],zero,zero,xmm12[10,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm14 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,u],zero,zero,xmm14[9,u,u,u,u],zero,zero,xmm14[10,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,3],zero,xmm1[u,u,u,u,4,5],zero,xmm1[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u> -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u> -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,u,7],zero,xmm11[u,u,u,u,u,8],zero,xmm11[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u> +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u> +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u],zero,xmm9[7,u,u,u,u,u],zero,xmm9[8,u,u,u,u,u],zero -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,7],zero,xmm5[u,u,u,u,u,8],zero,xmm5[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,7],zero,xmm6[u,u,u,u,u,8],zero,xmm6[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm2, %ymm15 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm2, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm10[4,u,u,u,u],zero,zero,xmm10[5,u,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[8,9],zero,xmm3[u,u,u,u,10,11],zero,xmm3[u,u,u,u,12,13] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm15 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,7],zero,xmm7[u,u,u,u,u,8],zero,xmm7[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u],zero,xmm4[7,u,u,u,u,u],zero,xmm4[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm1, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm11, %ymm15 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm15, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm12[4,u,u,u,u],zero,zero,xmm12[5,u,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[8,9],zero,xmm6[u,u,u,u,10,11],zero,xmm6[u,u,u,u,12,13] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u],zero,zero,xmm12[2,u,u,u,u],zero,zero,xmm12[3,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,4,5],zero,xmm6[u,u,u,u,6,7],zero,xmm6[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm13 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u],zero,zero,xmm14[2,u,u,u,u],zero,zero,xmm14[3,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm15 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3],xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm4, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm4, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm12, %ymm13 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm5, %ymm11 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0],zero,xmm11[2,3,4,5,6,7],zero,xmm11[9,10,11,12,13,14],zero -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,xmm2[13],zero,zero,zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,zero,zero,xmm2[15] -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0],zero,xmm6[2,3,4,5,6,7],zero,xmm6[9,10,11,12,13,14],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,xmm14[13],zero,zero,zero,zero,zero,zero,xmm14[14],zero,zero,zero,zero,zero,zero,xmm14[15] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm12, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8,9],zero,xmm0[11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,zero,xmm2[12],zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[u,u,u],zero,zero,xmm10[9,u,u,u,u],zero,zero,xmm10[10,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm14[11],zero,zero,zero,zero,zero,zero,xmm14[12],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,2,3],zero,xmm4[u,u,u,u,4,5],zero,xmm4[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u],zero,zero,xmm10[11,u,u,u,u],zero,zero,xmm10[12,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[4,5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm11[8],xmm8[9],xmm11[9],xmm8[10],xmm11[10],xmm8[11],xmm11[11],xmm8[12],xmm11[12],xmm8[13],xmm11[13],xmm8[14],xmm11[14],xmm8[15],xmm11[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10],zero,xmm6[u,u,u,u,13,12],zero,xmm6[u,u,u,u,15,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm10[13,u,u,u,u],zero,zero,xmm10[14,u,u,u,u],zero,zero,xmm10[15] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u],zero,zero,xmm10[2,u,u,u,u],zero,zero,xmm10[3,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,4,5],zero,xmm6[u,u,u,u,6,7],zero,xmm6[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10],zero,xmm7[u,u,u,u,13,12],zero,xmm7[u,u,u,u,15,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm8[13,u,u,u,u],zero,zero,xmm8[14,u,u,u,u],zero,zero,xmm8[15] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[2,u,u,u,u],zero,zero,xmm8[3,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5],zero,xmm7[u,u,u,u,6,7],zero,xmm7[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) -; AVX1-ONLY-NEXT: addq $168, %rsp +; AVX1-ONLY-NEXT: addq $216, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3025,12 +3028,12 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm7 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] @@ -3061,12 +3064,12 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,4,5,5,7,4,5] ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm11 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = @@ -3085,14 +3088,14 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,ymm7[27,28,29,30],zero,ymm7[28],zero,ymm7[26,27,30,31],zero,ymm7[29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,ymm6[27,28,29,30],zero,ymm6[28],zero,ymm6[26,27,30,31],zero,ymm6[29] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u> ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 @@ -3110,10 +3113,10 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm13, %ymm14, %ymm13 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13 @@ -3127,10 +3130,10 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm7[1,2,3,0,1,14],zero,ymm7[0,1,0,1,14,15],zero,ymm7[15,16,17,18,19,16],zero,ymm7[30,31,16,17,16,17],zero,ymm7[31,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] @@ -3194,12 +3197,12 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] @@ -3229,14 +3232,14 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,ymm7[27,28,29,30],zero,ymm7[28],zero,ymm7[26,27,30,31],zero,ymm7[29] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,ymm6[27,28,29,30],zero,ymm6[28],zero,ymm6[26,27,30,31],zero,ymm6[29] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 @@ -3254,10 +3257,10 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 @@ -3277,10 +3280,10 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm13, %ymm14, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13 @@ -3294,10 +3297,10 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm7[1,2,3,0,1,14],zero,ymm7[0,1,0,1,14,15],zero,ymm7[15,16,17,18,19,16],zero,ymm7[30,31,16,17,16,17],zero,ymm7[31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] @@ -3942,16 +3945,14 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE: # %bb.0: ; SSE-NEXT: subq $648, %rsp # imm = 0x288 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa 48(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm14 ; SSE-NEXT: movdqa 48(%rsi), %xmm2 ; SSE-NEXT: movdqa 48(%rdx), %xmm3 -; SSE-NEXT: movdqa 48(%rcx), %xmm9 -; SSE-NEXT: movdqa 48(%r8), %xmm14 +; SSE-NEXT: movdqa 48(%rcx), %xmm10 +; SSE-NEXT: movdqa 48(%r8), %xmm5 ; SSE-NEXT: movdqa 48(%r9), %xmm8 -; SSE-NEXT: movdqa 48(%rax), %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rax), %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm6, %xmm0 @@ -3963,15 +3964,15 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,1,2,3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,1,2,3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] @@ -3979,20 +3980,22 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] ; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] ; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,1,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] @@ -4001,7 +4004,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] ; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 @@ -4011,22 +4014,22 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: por %xmm3, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] ; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pandn %xmm5, %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,6,6,6,6] -; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: movdqa %xmm6, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] @@ -4034,42 +4037,43 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm6, %xmm8 ; SSE-NEXT: pand %xmm4, %xmm8 ; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: pand %xmm4, %xmm8 ; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm11 ; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm6 ; SSE-NEXT: por %xmm6, %xmm11 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,5,6,6] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pand %xmm15, %xmm11 +; SSE-NEXT: pand %xmm1, %xmm11 ; SSE-NEXT: por %xmm11, %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,1,1,3] -; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,4,5,5,7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,2] -; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm6 ; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,5,5,5,5] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm11 @@ -4078,14 +4082,16 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm6 ; SSE-NEXT: por %xmm11, %xmm6 -; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: pand %xmm1, %xmm6 ; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm11 ; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm6 ; SSE-NEXT: por %xmm6, %xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] @@ -4093,38 +4099,39 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm9, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm9, %xmm14 ; SSE-NEXT: por %xmm11, %xmm8 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: por %xmm8, %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: pand %xmm11, %xmm8 +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: pand %xmm11, %xmm5 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pandn %xmm6, %xmm5 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm6 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm5 @@ -4138,84 +4145,87 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,6,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,3] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm11 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa (%rcx), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,1,2,3] +; SSE-NEXT: movdqa (%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm5 ; SSE-NEXT: movdqa (%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm9 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa (%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; SSE-NEXT: movdqa (%r9), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,1,2,3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 ; SSE-NEXT: movdqa (%r8), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: movdqa (%rax), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa (%rax), %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm8, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm11, %xmm3 -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: por %xmm6, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,3] -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa 16(%rcx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4224,1065 +4234,1072 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: movdqa 16(%rdx), %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: movdqa 16(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa 16(%r9), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 ; SSE-NEXT: movdqa 16(%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm3, %xmm8 +; SSE-NEXT: por %xmm3, %xmm6 ; SSE-NEXT: movdqa 16(%rax), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm8, %xmm11 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: por %xmm6, %xmm14 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,3] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa 32(%rcx), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,0,3] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa 32(%rcx), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: movdqa 32(%rdx), %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa 32(%r9), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: movdqa 32(%rdx), %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm8, %xmm13 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: movdqa 32(%r9), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa 32(%r8), %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%r8), %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: movdqa 32(%rax), %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rax), %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm13 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: por %xmm13, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: por %xmm12, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufhw $170, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm14 -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm10 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[0,2,1,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,0] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: pandn %xmm2, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE-NEXT: movdqa %xmm7, %xmm10 ; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pand %xmm7, %xmm11 -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,2,2] +; SSE-NEXT: pand %xmm7, %xmm15 +; SSE-NEXT: por %xmm15, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[1,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm10 ; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm13[1,1,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm8[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: por %xmm10, %xmm12 -; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: por %xmm2, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm10, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm15, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm13[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm10, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: pand %xmm13, %xmm10 -; SSE-NEXT: por %xmm10, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm15 +; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: pand %xmm3, %xmm10 -; SSE-NEXT: por %xmm6, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pand %xmm12, %xmm8 +; SSE-NEXT: por %xmm2, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[2,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7] +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[1,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: pand %xmm6, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,1,3] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: por %xmm8, %xmm2 ; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] -; SSE-NEXT: pand %xmm14, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm11[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: pandn %xmm5, %xmm10 -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: por %xmm6, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] ; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,6,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,6,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,2] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: por %xmm0, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,2,1,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,0] -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm8, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: pand %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,5,7] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: pand %xmm12, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[2,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[2,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm8 -; SSE-NEXT: por %xmm8, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,1,3] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm8, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,1,3] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] -; SSE-NEXT: pand %xmm14, %xmm8 -; SSE-NEXT: por %xmm8, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm8, %xmm12 -; SSE-NEXT: pand %xmm15, %xmm10 -; SSE-NEXT: por %xmm10, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: pandn %xmm12, %xmm9 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm10, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm13[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] +; SSE-NEXT: pandn %xmm6, %xmm10 ; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: por %xmm8, %xmm10 -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,6,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,3] +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,6,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,2] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm8, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm12, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: por %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: pand %xmm12, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: por %xmm6, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm10, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm10 ; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,1,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,1,0] -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm13[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] -; SSE-NEXT: pand %xmm3, %xmm8 +; SSE-NEXT: pand %xmm12, %xmm8 ; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm9[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] -; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: movdqa %xmm14, %xmm10 ; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] -; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: pand %xmm14, %xmm8 ; SSE-NEXT: por %xmm8, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: pandn %xmm8, %xmm12 +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: pandn %xmm8, %xmm15 ; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: por %xmm10, %xmm12 +; SSE-NEXT: por %xmm10, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] ; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: pandn %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm15, %xmm10 ; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm0, %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pand %xmm10, %xmm12 -; SSE-NEXT: por %xmm8, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm12, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: pandn %xmm8, %xmm15 +; SSE-NEXT: pand %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: por %xmm10, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm15, %xmm8 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pand %xmm13, %xmm8 -; SSE-NEXT: por %xmm8, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm14, %xmm8 -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por %xmm12, %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm12, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,6,7] +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: pand %xmm8, %xmm14 -; SSE-NEXT: por %xmm12, %xmm14 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pand %xmm4, %xmm15 +; SSE-NEXT: por %xmm10, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: pandn %xmm14, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,6,5,7] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm15, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm9 ; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: pand %xmm3, %xmm14 -; SSE-NEXT: por %xmm14, %xmm9 +; SSE-NEXT: pand %xmm1, %xmm15 +; SSE-NEXT: por %xmm15, %xmm9 ; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: por %xmm12, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,1,3] -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,7] +; SSE-NEXT: por %xmm10, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm12, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm12 -; SSE-NEXT: por %xmm14, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm12, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,2,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: por %xmm15, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,5,7] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pand %xmm15, %xmm12 -; SSE-NEXT: por %xmm12, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: pand %xmm14, %xmm10 +; SSE-NEXT: por %xmm10, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: por %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] ; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm12, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,6,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm14 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,6,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,2] +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: por %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,0,2,1] -; SSE-NEXT: pandn %xmm12, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] +; SSE-NEXT: pandn %xmm10, %xmm15 ; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 ; SSE-NEXT: pand %xmm7, %xmm15 -; SSE-NEXT: por %xmm15, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm12, %xmm10 -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,2,2] +; SSE-NEXT: por %xmm15, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,1,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: por %xmm3, %xmm12 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: por %xmm10, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm15 -; SSE-NEXT: por %xmm15, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: por %xmm12, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm7 ; SSE-NEXT: por %xmm3, %xmm7 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,5,7] +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,2] -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[2,2,2,2,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: por %xmm14, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm3, 368(%rax) -; SSE-NEXT: movdqa %xmm11, 352(%rax) -; SSE-NEXT: movdqa %xmm10, 336(%rax) +; SSE-NEXT: movdqa %xmm1, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 288(%rax) @@ -5299,7 +5316,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) @@ -5336,7 +5353,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX1-ONLY-LABEL: store_i8_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $632, %rsp # imm = 0x278 +; AVX1-ONLY-NEXT: subq $616, %rsp # imm = 0x268 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm6 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15] @@ -5349,69 +5366,72 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,6,7],zero,xmm0[u,u,u,u,8,9],zero,xmm0[u,u,u,u,10] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm5[8],xmm11[9],xmm5[9],xmm11[10],xmm5[10],xmm11[11],xmm5[11],xmm11[12],xmm5[12],xmm11[13],xmm5[13],xmm11[14],xmm5[14],xmm11[15],xmm5[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,3],zero,xmm0[u,u,u,u,4,5],zero,xmm0[u,u,u] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u],zero,zero,xmm6[9,u,u,u,u],zero,zero,xmm6[10,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u> -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u> -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u> +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u> +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u> +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 @@ -5424,123 +5444,119 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm13 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm2[4,u,u,u,u],zero,zero,xmm2[5,u,u,u,u],zero,zero ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,6,7],zero,xmm3[u,u,u,u,8,9],zero,xmm3[u,u,u,u,10] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,2,3],zero,xmm3[u,u,u,u,4,5],zero,xmm3[u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm12 ; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13> ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u],zero,zero,xmm11[2,u,u,u,u],zero,zero,xmm11[3,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u],zero,zero,xmm12[2,u,u,u,u],zero,zero,xmm12[3,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,5],zero,xmm2[u,u,u,u,6,7],zero,xmm2[u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3],xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3],xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 @@ -5565,188 +5581,189 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm15, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm15 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,7],zero,xmm2[u,u,u,u,u,8],zero,xmm2[u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u],zero,xmm14[7,u,u,u,u,u],zero,xmm14[8,u,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,7],zero,xmm1[u,u,u,u,u,8],zero,xmm1[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u],zero,xmm14[7,u,u,u,u,u],zero,xmm14[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,7],zero,xmm2[u,u,u,u,u,8],zero,xmm2[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm10, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u],zero,zero,xmm7[11,u,u,u,u],zero,zero,xmm7[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u],zero,zero,xmm7[11,u,u,u,u],zero,zero,xmm7[12,u,u,u,u],zero ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm11, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,2,3],zero,xmm4[u,u,u,u,4,5],zero,xmm4[u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u],zero,zero,xmm7[9,u,u,u,u],zero,zero,xmm7[10,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u],zero,zero,xmm7[9,u,u,u,u],zero,zero,xmm7[10,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u> ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm2[4,u,u,u,u],zero,zero,xmm2[5,u,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[8,9],zero,xmm3[u,u,u,u,10,11],zero,xmm3[u,u,u,u,12,13] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u],zero,zero,xmm2[2,u,u,u,u],zero,zero,xmm2[3,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,4,5],zero,xmm3[u,u,u,u,6,7],zero,xmm3[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm3[4,u,u,u,u],zero,zero,xmm3[5,u,u,u,u],zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u],zero,zero,xmm3[2,u,u,u,u],zero,zero,xmm3[3,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9> +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u],zero,xmm12[7,u,u,u,u,u],zero,xmm12[8,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,7],zero,xmm13[u,u,u,u,u,8],zero,xmm13[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u],zero,xmm2[7,u,u,u,u,u],zero,xmm2[8,u,u,u,u,u],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,7],zero,xmm4[u,u,u,u,u,8],zero,xmm4[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[u,u,6,7,8,9],zero,xmm1[u,u,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm4[9,u,u],zero,zero,zero,zero,xmm4[10,u,u],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,xmm1[u,6,7,8,9,10],zero,xmm1[u,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero,zero,xmm2[10,u],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4],zero,xmm1[6,7,8,9,10,11],zero,xmm1[13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm11[9],zero,zero,zero,zero,zero,zero,xmm11[10],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u],zero,xmm9[7,u,u,u,u,u],zero,xmm9[8,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,7],zero,xmm13[u,u,u,u,u,8],zero,xmm13[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u],zero,xmm1[7,u,u,u,u,u],zero,xmm1[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,7],zero,xmm10[u,u,u,u,u,8],zero,xmm10[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[u,u,6,7,8,9],zero,xmm4[u,u,13,14,15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm6[9,u,u],zero,zero,zero,zero,xmm6[10,u,u],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[u,6,7,8,9,10],zero,xmm4[u,13,14,15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm5[9,u],zero,zero,zero,zero,zero,xmm5[10,u],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4],zero,xmm4[6,7,8,9,10,11],zero,xmm4[13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,xmm12[9],zero,zero,zero,zero,zero,zero,xmm12[10],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,1,2,3,4],zero,xmm0[u,u,8,9,10,11],zero,xmm0[u,u,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u],zero,zero,zero,zero,xmm4[7,u,u],zero,zero,zero,zero,xmm4[8,u,u],zero -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,1,2,3,4,5],zero,xmm0[u,8,9,10,11,12],zero,xmm0[u,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u],zero,zero,zero,zero,zero,xmm2[7,u],zero,zero,zero,zero,zero,xmm2[8,u],zero -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6],zero,xmm0[8,9,10,11,12,13],zero,xmm0[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[6],zero,zero,zero,zero,zero,zero,xmm11[7],zero,zero,zero,zero,zero,zero,xmm11[8],zero -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,1,2,3,4],zero,xmm3[u,u,8,9,10,11],zero,xmm3[u,u,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u],zero,zero,zero,zero,xmm6[7,u,u],zero,zero,zero,zero,xmm6[8,u,u],zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,1,2,3,4,5],zero,xmm3[u,8,9,10,11,12],zero,xmm3[u,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u],zero,zero,zero,zero,zero,xmm5[7,u],zero,zero,zero,zero,zero,xmm5[8,u],zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm3[1,2,3,4,5,6],zero,xmm3[8,9,10,11,12,13],zero,xmm3[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[6],zero,zero,zero,zero,zero,zero,xmm12[7],zero,zero,zero,zero,zero,zero,xmm12[8],zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,xmm0[u,u,4,5,6,7],zero,xmm0[u,u,11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm5[11,u,u],zero,zero,zero,zero,xmm5[12,u,u],zero,zero,zero,zero,xmm5[13] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[u,4,5,6,7,8],zero,xmm1[u,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[11,u],zero,zero,zero,zero,zero,xmm2[12,u],zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7,8,9],zero,xmm1[11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm11[11],zero,zero,zero,zero,zero,zero,xmm11[12],zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0],zero,xmm0[u,u,4,5,6,7],zero,xmm0[u,u,11,12,13,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm6[11,u,u],zero,zero,zero,zero,xmm6[12,u,u],zero,zero,zero,zero,xmm6[13] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1],zero,xmm3[u,4,5,6,7,8],zero,xmm3[u,11,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm5[11,u],zero,zero,zero,zero,zero,xmm5[12,u],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2],zero,xmm3[4,5,6,7,8,9],zero,xmm3[11,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm12[11],zero,zero,zero,zero,zero,zero,xmm12[12],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[u,4,5,6,7,0],zero,xmm0[u,11,12,13,14,1],zero,xmm0[u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[13,u],zero,zero,zero,zero,zero,xmm2[14,u],zero,zero,zero,zero,zero,xmm2[15,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[13,u],zero,zero,zero,zero,zero,xmm5[14,u],zero,zero,zero,zero,zero,xmm5[15,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2,3,4,5,6,7],zero,xmm0[9,10,11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm11[13],zero,zero,zero,zero,zero,zero,xmm11[14],zero,zero,zero,zero,zero,zero,xmm11[15] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm12[13],zero,zero,zero,zero,zero,zero,xmm12[14],zero,zero,zero,zero,zero,zero,xmm12[15] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 @@ -5755,111 +5772,110 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10],zero,xmm4[u,u,u,u,13,12],zero,xmm4[u,u,u,u,15,14],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm13[13,u,u,u,u],zero,zero,xmm13[14,u,u,u,u],zero,zero,xmm13[15] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm14[13,u,u,u,u],zero,zero,xmm14[14,u,u,u,u],zero,zero,xmm14[15] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm7[4,5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm10[4,5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9> -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm12, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[4,5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm13, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm13, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm14, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm12, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm12, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm8, %xmm12, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm13, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10],zero,xmm5[u,u,u,u,13,12],zero,xmm5[u,u,u,u,15,14],zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10],zero,xmm6[u,u,u,u,13,12],zero,xmm6[u,u,u,u,15,14],zero +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5880,13 +5896,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rax) -; AVX1-ONLY-NEXT: addq $632, %rsp # imm = 0x278 +; AVX1-ONLY-NEXT: addq $616, %rsp # imm = 0x268 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $808, %rsp # imm = 0x328 +; AVX2-SLOW-NEXT: subq $824, %rsp # imm = 0x338 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5971,339 +5987,340 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3],xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm9 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm9[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm9 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[1,1,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm15, %ymm13, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[1,1,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[1,1,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm12, %ymm15, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm13 -; AVX2-SLOW-NEXT: vpor %xmm8, %xmm13, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm12 +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm12, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm12 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm12 -; AVX2-SLOW-NEXT: vpor %xmm15, %xmm12, %xmm12 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm14 +; AVX2-SLOW-NEXT: vpor %xmm12, %xmm14, %xmm12 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm8, %ymm12, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm5, %xmm11 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX2-SLOW-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm10, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm12, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm13 -; AVX2-SLOW-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm14, %xmm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm15 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm11, %ymm15, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm15, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm10 -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm14, %xmm12 -; AVX2-SLOW-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm12 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm10, %ymm12, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm4 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm12, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm13[8],mem[8],xmm13[9],mem[9],xmm13[10],mem[10],xmm13[11],mem[11],xmm13[12],mem[12],xmm13[13],mem[13],xmm13[14],mem[14],xmm13[15],mem[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,5,5,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,5,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,5,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpshuflw $150, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpshuflw $150, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,1,1,4,4,5,5] -; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0] -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,1,1,4,4,5,5] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm5 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm6 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0] +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[18],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22],zero,ymm12[20],zero +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm10, %ymm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm9 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,ymm0[27,20,21,26],zero,ymm0[24],zero,ymm0[26,27,26,27],zero,ymm0[25] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero,ymm1[27],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] ; AVX2-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,ymm6[27,20,21,26],zero,ymm6[24],zero,ymm6[26,27,26,27],zero,ymm6[25] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero,ymm7[27],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero,zero,zero,ymm12[27] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm14 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vpor %ymm4, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm11, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm8 +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm8, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm10, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm12, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm14 +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm11 ; AVX2-SLOW-NEXT: vpor %ymm9, %ymm11, %ymm9 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm4, %ymm9, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm5 -; AVX2-SLOW-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm5, %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm15, %ymm7 +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm12, %ymm8 +; AVX2-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm15, %ymm10 ; AVX2-SLOW-NEXT: vpor %ymm8, %ymm10, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm11 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm6, %ymm9 -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm13, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm9 +; AVX2-SLOW-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm14, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm7, %ymm9, %ymm7 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm8, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm15[11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] -; AVX2-SLOW-NEXT: vpshufhw $190, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm9, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm9, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 320(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 320(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6326,8 +6343,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 416(%rax) -; AVX2-SLOW-NEXT: addq $808, %rsp # imm = 0x328 +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 416(%rax) +; AVX2-SLOW-NEXT: addq $824, %rsp # imm = 0x338 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -6336,23 +6353,24 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: subq $648, %rsp # imm = 0x288 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm3 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero,ymm7[27],zero +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6360,18 +6378,18 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -6388,361 +6406,358 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; AVX2-FAST-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm14 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm8 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm12 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX2-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm6 ; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,0,0,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,2,0,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm4, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[1,1,0,0,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm11, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,0,0,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,2,0,0,1] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm13 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,0,0,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm11, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm12 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm6, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm13 -; AVX2-FAST-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm14 +; AVX2-FAST-NEXT: vpor %xmm6, %xmm14, %xmm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm14 +; AVX2-FAST-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX2-FAST-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm10 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm11 -; AVX2-FAST-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm10, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm13 -; AVX2-FAST-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm15 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm15 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm11, %ymm15, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm15, %ymm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm9, %xmm12 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm12 -; AVX2-FAST-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX2-FAST-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm10, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-FAST-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm14[8],mem[8],xmm14[9],mem[9],xmm14[10],mem[10],xmm14[11],mem[11],xmm14[12],mem[12],xmm14[13],mem[13],xmm14[14],mem[14],xmm14[15],mem[15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,6] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,5,5,6] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,5,6] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,5,5,6] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[17,18,19,30],zero,ymm1[28],zero,ymm1[28,29,30,31],zero,ymm1[29],zero,ymm1[31] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm10 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm13 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero,zero +; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm12 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vpor %ymm3, %ymm6, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm6 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[27,28,29,30],zero,ymm0[28],zero,ymm0[26,27,30,31],zero,ymm0[29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[27,28,29,30],zero,ymm0[28],zero,ymm0[26,27,30,31],zero,ymm0[29] ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm4[27],zero,zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm3, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpor %ymm2, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[23],zero,ymm11[27,20,21,26],zero,ymm11[24],zero,ymm11[26,27,26,27],zero,ymm11[25] -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm15 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[23],zero,ymm12[27,20,21,26],zero,ymm12[24],zero,ymm12[26,27,26,27],zero,ymm12[25] +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27],zero -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero,ymm5[27],zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm7, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm11 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm8, %ymm11, %ymm8 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm12, %ymm14, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm8, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm11, %ymm15, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm14, %ymm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm9 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm10 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm13 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm11, %ymm13, %ymm11 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm8[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,4,5,5,7,4,5] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm14, %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm11, %ymm13, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm10 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm12 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm6[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm14, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm12, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vpor %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm7 -; AVX2-FAST-NEXT: vpor %ymm7, %ymm13, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,4,5,5,7,4,5] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm15, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm9, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm9, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpor %ymm9, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpor %ymm6, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm9, %ymm6, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm10 -; AVX2-FAST-NEXT: vpor %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm9, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 320(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 320(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6774,23 +6789,24 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: subq $648, %rsp # imm = 0x288 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero,ymm7[27],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6798,18 +6814,18 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -6826,236 +6842,244 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm14, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm12, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm13, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm11, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm8, %ymm13, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm12, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm12, %xmm15, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm15, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm11, %ymm14, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm4, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm10, %ymm12, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm11, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm15[8],mem[8],xmm15[9],mem[9],xmm15[10],mem[10],xmm15[11],mem[11],xmm15[12],mem[12],xmm15[13],mem[13],xmm15[14],mem[14],xmm15[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm12[8],mem[8],xmm12[9],mem[9],xmm12[10],mem[10],xmm12[11],mem[11],xmm12[12],mem[12],xmm12[13],mem[13],xmm12[14],mem[14],xmm12[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[17,18,19,30],zero,ymm12[28],zero,ymm12[28,29,30,31],zero,ymm12[29],zero,ymm12[31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[17,18,19,30],zero,ymm1[28],zero,ymm1[28,29,30,31],zero,ymm1[29],zero,ymm1[31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm6, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[27,28,29,30],zero,ymm0[28],zero,ymm0[26,27,30,31],zero,ymm0[29] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[27,28,29,30],zero,ymm0[28],zero,ymm0[26,27,30,31],zero,ymm0[29] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[23],zero,ymm15[27,20,21,26],zero,ymm15[24],zero,ymm15[26,27,26,27],zero,ymm15[25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[23],zero,ymm5[27,20,21,26],zero,ymm5[24],zero,ymm5[26,27,26,27],zero,ymm5[25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero,ymm5[27],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero,zero,zero,ymm12[27],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm7, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero,zero,zero,ymm12[27] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 @@ -7074,53 +7098,53 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm10, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] ; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm13, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm11, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm5, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm15, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm11, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm13, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm14, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm15, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm5, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm14, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm5, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm10, %ymm12, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm13, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm13, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm13, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm15, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm15, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm7, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm9, %ymm11, %ymm0 @@ -7129,48 +7153,48 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm11, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm12, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm6, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm14, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm14, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm10, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm9, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 320(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) @@ -7198,482 +7222,472 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; -; AVX512F-SLOW-LABEL: store_i8_stride7_vf64: -; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $1368, %rsp # imm = 0x558 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm15, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 -; AVX512F-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm24 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] -; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm10 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero -; AVX512F-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm8, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm7[23],zero,ymm7[21,22,23,26],zero,ymm7[24],zero,ymm7[28,29,26,27] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm7[18,19,20,21],zero,ymm7[19],zero,ymm7[25,26,27,22],zero,ymm7[20],zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm7, %ymm26 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm15, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero,ymm13[20],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm30 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm27 -; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vporq %xmm2, %xmm3, %xmm22 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm17 -; AVX512F-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm9, %ymm3 -; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa %ymm6, %ymm13 -; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm25 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm29 -; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX512F-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm4 -; AVX512F-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX512F-SLOW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,0,1],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-SLOW-NEXT: vpandn %ymm1, %ymm9, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[27],zero,zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm10[0,1,2,3],zmm12[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm13[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm26 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm12 = [9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10] -; AVX512F-SLOW-NEXT: vmovdqa %ymm15, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm15, %ymm15 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero,zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm12 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm27 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm31 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm0 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm22, %zmm22 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm15[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm15 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm13 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm0[23],zero,ymm0[21,22,23,26],zero,ymm0[24],zero,ymm0[28,29,26,27] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm0[18,19,20,21],zero,ymm0[19],zero,ymm0[25,26,27,22],zero,ymm0[20],zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm15, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa %ymm6, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm15 -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm29 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-SLOW-NEXT: vpandnq %ymm15, %ymm25, %ymm15 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm30[2,3,2,3] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm10[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm9[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm13[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm12[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm20[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm19[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm17[2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vporq %zmm4, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpand %ymm4, %ymm11, %ymm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm9 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vporq %zmm9, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm9 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm23[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm11 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm0 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm9 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vporq %zmm0, %zmm9, %zmm0 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm9 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm17 = zmm24[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm17 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm17 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm27[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm22[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm4, %ymm16, %ymm15 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $236, %ymm4, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm7[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm18, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm4 -; AVX512F-SLOW-NEXT: vpor %ymm13, %ymm12, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm0 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm20 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm31[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm10 -; AVX512F-SLOW-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm8 = zmm8[0,1,0,1],mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[1,1,0,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm12 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[1,1,0,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm15 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm29[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm18 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm28[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm13 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm22 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm24 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm21 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm21 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,0,1,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm4 -; AVX512F-SLOW-NEXT: vporq %ymm15, %ymm16, %ymm5 -; AVX512F-SLOW-NEXT: vporq %ymm18, %ymm19, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm10[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm13, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3 -; AVX512F-SLOW-NEXT: vporq %ymm22, %ymm24, %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm25[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm5 = zmm20[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm12[0,0,1,0,4,4,5,4] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0 -; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 64(%rax) -; AVX512F-SLOW-NEXT: addq $1368, %rsp # imm = 0x558 -; AVX512F-SLOW-NEXT: vzeroupper -; AVX512F-SLOW-NEXT: retq +; AVX512F-ONLY-SLOW-LABEL: store_i8_stride7_vf64: +; AVX512F-ONLY-SLOW: # %bb.0: +; AVX512F-ONLY-SLOW-NEXT: subq $1288, %rsp # imm = 0x508 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero,ymm14[29],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm9, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rax), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm6[23],zero,ymm6[21,22,23,26],zero,ymm6[24],zero,ymm6[28,29,26,27] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm6[18,19,20,21],zero,ymm6[19],zero,ymm6[25,26,27,22],zero,ymm6[20],zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm4, %xmm5, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm11, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm10, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm14, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero,zero,zero,ymm8[18] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm8, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm14[0,1,14],zero,ymm14[12,13,0,1,14,15],zero,ymm14[3,12,13,2,3,16],zero,ymm14[30,31,28,29,16,17],zero,ymm14[31,18,19,28,29,18],zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm12, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm15, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm5, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vporq %xmm1, %xmm3, %xmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,0,1],zmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm1, %ymm3, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[27],zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,1,1,4,4,5,5] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm30, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm31, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm19 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm15, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[18],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22],zero,ymm11[20] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm15[23],zero,ymm15[21,22,23,26],zero,ymm15[24],zero,ymm15[28,29,26,27] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm15[18,19,20,21],zero,ymm15[19],zero,ymm15[25,26,27,22],zero,ymm15[20],zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[21],zero,ymm14[19],zero,zero,zero,zero,ymm14[22],zero,ymm14[20],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm14, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm14, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm15, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm15, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-ONLY-SLOW-NEXT: vpandnq %ymm15, %ymm25, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm21, %zmm5, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm1[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm23[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm8[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm3[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm18[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm17[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,1,1,4,4,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vporq %zmm1, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] +; AVX512F-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpandq %ymm19, %ymm6, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vporq %zmm2, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm26[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vporq %zmm1, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm27[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm30[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm31[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm19, %ymm21, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm7[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm19, %ymm10, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm9[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm20, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm9, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm11, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm19 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm28[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3],xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rax), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm21 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm21 = zmm10[0,1,0,1],mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[1,1,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[1,1,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm29[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm17 = mem[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm18 = mem[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm11, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, (%rsp), %ymm23 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm23 = mem[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm26 = mem[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm14[0,0,1,0] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm15, %ymm16, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm17, %ymm18, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm10[2,2,3,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm11[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm23, %ymm26, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm25[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm19[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm12[0,0,1,0,4,4,5,4] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $1288, %rsp # imm = 0x508 +; AVX512F-ONLY-SLOW-NEXT: vzeroupper +; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i8_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1432, %rsp # imm = 0x598 +; AVX512F-ONLY-FAST-NEXT: subq $1496, %rsp # imm = 0x5D8 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm1 @@ -7682,438 +7696,891 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero ; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm22 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm21 ; AVX512F-ONLY-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vporq %xmm0, %xmm3, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm12, %xmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm0, %xmm5, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero,zero,zero,ymm15[18] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[0,1,14],zero,ymm3[12,13,0,1,14,15],zero,ymm3[3,12,13,2,3,16],zero,ymm3[30,31,28,29,16,17],zero,ymm3[31,18,19,28,29,18],zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm7, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[0,1,14],zero,ymm8[12,13,0,1,14,15],zero,ymm8[3,12,13,2,3,16],zero,ymm8[30,31,28,29,16,17],zero,ymm8[31,18,19,28,29,18],zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm6, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm2, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20],zero,zero +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [2,2,3,3,2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm1, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,6] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm23, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] -; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm24, %ymm8, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm11, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm29, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm15[0,1,2,3],zmm8[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm28 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm10, %xmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm31 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] +; AVX512F-ONLY-FAST-NEXT: # ymm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm31, %ymm2, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm2[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm12, %xmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm14, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm21, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm3[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm1[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm1[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm23, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm0[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[21],zero,ymm15[19],zero,zero,zero,zero,ymm15[22],zero,ymm15[20],zero,zero ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5] ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm0, %ymm26, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm0, %ymm25, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm13[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm14[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm0[23],zero,ymm0[23,24,25,26],zero,ymm0[24],zero,ymm0[30,31] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm9[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm13[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm1 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm7[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm12[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm26[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm11[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm12 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm4, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm4, %ymm11, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm31, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm12, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm15, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm4, %ymm21, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm4, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm24, %ymm19, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm5, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm24, %ymm18, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm5, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm30[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm4, %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm24, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm14, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm5, %ymm7, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm10, %ymm8, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm9, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpandq %ymm5, %ymm27, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm28, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] ; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpandq %ymm31, %ymm24, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vporq %zmm3, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpandq %ymm31, %ymm25, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vporq %zmm3, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpandq %ymm31, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm1, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm5, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm0, %zmm22 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm16 = zmm0[0,1,0,1],mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[1,1,0,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,2,0,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm26 = zmm0[0,1,0,1],mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[1,1,0,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,2,0,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[1,1,0,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm12, %ymm5, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[1,1,0,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm12 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] ; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [4,5,4,5,5,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm6, %ymm19, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm9[23],zero,ymm9[23,24,25,26],zero,ymm9[24],zero,ymm9[30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,4,5,5,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm15, %ymm9, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm14, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm23[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm17, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm15, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm13, %ymm8, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm13, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm11[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm14, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm20, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm1 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1432, %rsp # imm = 0x598 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $1496, %rsp # imm = 0x5D8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; +; AVX512DQ-SLOW-LABEL: store_i8_stride7_vf64: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: subq $1288, %rsp # imm = 0x508 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512DQ-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm22 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero,ymm14[29],zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm9, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm25 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rax), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm6[23],zero,ymm6[21,22,23,26],zero,ymm6[24],zero,ymm6[28,29,26,27] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm6[18,19,20,21],zero,ymm6[19],zero,ymm6[25,26,27,22],zero,ymm6[20],zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm27 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20],zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm5, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm11, %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm9 +; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm10, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm9 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm9, %ymm14, %ymm9 +; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero,zero,zero,ymm8[18] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm8, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm14 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm14[0,1,14],zero,ymm14[12,13,0,1,14,15],zero,ymm14[3,12,13,2,3,16],zero,ymm14[30,31,28,29,16,17],zero,ymm14[31,18,19,28,29,18],zero +; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm12, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm8 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm15, %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm25 +; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm5, %xmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512DQ-SLOW-NEXT: vporq %xmm1, %xmm3, %xmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX512DQ-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX512DQ-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm16 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,0,1],zmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%rax), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512DQ-SLOW-NEXT: vpandn %ymm1, %ymm3, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rax), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm29 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[27],zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm24 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm14 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,1,1,4,4,5,5] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm30, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm31, %xmm3 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm19 +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm30 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm15, %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm31 +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm11 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[18],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22],zero,ymm11[20] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm15[23],zero,ymm15[21,22,23,26],zero,ymm15[24],zero,ymm15[28,29,26,27] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm15[18,19,20,21],zero,ymm15[19],zero,ymm15[25,26,27,22],zero,ymm15[20],zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[21],zero,ymm14[19],zero,zero,zero,zero,ymm14[22],zero,ymm14[20],zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm14, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm14, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm15, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm15, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512DQ-SLOW-NEXT: vpandnq %ymm15, %ymm25, %ymm15 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm4, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm15 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm5 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm21, %zmm5, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm1[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm23[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm12 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm8[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm3[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm18[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm17[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm7 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,1,1,4,4,5,5] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm8 +; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vporq %zmm1, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] +; AVX512DQ-SLOW-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpandq %ymm19, %ymm6, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vporq %zmm2, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm26[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vporq %zmm1, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm6 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm27[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm30[0,1,0,1,4,5,4,5] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm31[0,1,0,1,4,5,4,5] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm19, %ymm21, %ymm14 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm7[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm19, %ymm10, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm9[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm20, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm9, %zmm7 +; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm11, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm19 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm28[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3],xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rax), %xmm11 +; AVX512DQ-SLOW-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm21 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm21 = zmm10[0,1,0,1],mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[1,1,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm14 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm12 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[1,1,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] +; AVX512DQ-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm29[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm17 = mem[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm18 = mem[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm10 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm11, %xmm13 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vpermq $238, (%rsp), %ymm23 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm23 = mem[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm26 = mem[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm22 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm14[0,0,1,0] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vporq %ymm15, %ymm16, %ymm0 +; AVX512DQ-SLOW-NEXT: vporq %ymm17, %ymm18, %ymm8 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm10[2,2,3,2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm11[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vporq %ymm23, %ymm26, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm25[0,1,0,1,4,5,4,5] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm19[0,1,0,1,4,5,4,5] +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm12[0,0,1,0,4,4,5,4] +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 64(%rax) +; AVX512DQ-SLOW-NEXT: addq $1288, %rsp # imm = 0x508 +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; ; AVX512DQ-FAST-LABEL: store_i8_stride7_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $1432, %rsp # imm = 0x598 +; AVX512DQ-FAST-NEXT: subq $1496, %rsp # imm = 0x5D8 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm15, %ymm17 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm10 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm1 @@ -8122,407 +8589,422 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero ; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 ; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm22 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm2 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm31 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = ; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm20 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm21 ; AVX512DQ-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm20 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vporq %xmm0, %xmm3, %xmm21 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm12, %xmm29 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm5 +; AVX512DQ-FAST-NEXT: vpor %xmm0, %xmm5, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm5 -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm5 -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm13 +; AVX512DQ-FAST-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm7 +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero,zero,zero,ymm15[18] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[0,1,14],zero,ymm3[12,13,0,1,14,15],zero,ymm3[3,12,13,2,3,16],zero,ymm3[30,31,28,29,16,17],zero,ymm3[31,18,19,28,29,18],zero +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm7, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm18 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[0,1,14],zero,ymm8[12,13,0,1,14,15],zero,ymm8[3,12,13,2,3,16],zero,ymm8[30,31,28,29,16,17],zero,ymm8[31,18,19,28,29,18],zero -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm6 +; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm6, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm14 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, %xmm3 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm2, %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20],zero,zero +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm7 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm8 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512DQ-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [2,2,3,3,2,2,3,3] +; AVX512DQ-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6] -; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512DQ-FAST-NEXT: vpandn %ymm1, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,6] +; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm23, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512DQ-FAST-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm22 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] -; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm24, %ymm8, %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm11, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm15[0,1,2,3],zmm8[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm28 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm10, %xmm25 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm18 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29],zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm31 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] +; AVX512DQ-FAST-NEXT: # ymm31 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm31, %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm10 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm2[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm12, %xmm21 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm11 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm14, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm3 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm21, %zmm30 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm3[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm10 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %xmm10 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6] -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm1 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm1[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm1 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm1[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] +; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm23, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm0[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[21],zero,ymm15[19],zero,zero,zero,zero,ymm15[22],zero,ymm15[20],zero,zero ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm4 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5] ; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512DQ-FAST-NEXT: vpandnq %ymm0, %ymm26, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] -; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512DQ-FAST-NEXT: vpandnq %ymm0, %ymm25, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm23 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm13 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm13[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm14[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm0[23],zero,ymm0[23,24,25,26],zero,ymm0[24],zero,ymm0[30,31] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm9 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm9[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm4 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm5 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm14 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm29 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm13[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm1 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm7[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm12[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm26[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm13 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm11[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] +; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm9 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm12 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm15 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm2 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm4, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] -; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm4, %ymm11, %ymm31 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm31, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm9 -; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm12, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm15, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm3 -; AVX512DQ-FAST-NEXT: vpandq %ymm4, %ymm21, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1 -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm4, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vpandq %ymm24, %ymm19, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm4, %zmm4 -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm5, %zmm4, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpandq %ymm24, %ymm18, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm5, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 -; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,0,1,4,5,4,5] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm30[0,1,0,1,4,5,4,5] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm4, %zmm5, %zmm11 -; AVX512DQ-FAST-NEXT: vpandq %ymm24, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm14, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm5, %ymm7, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vpor %ymm10, %ymm8, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm9, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vpandq %ymm5, %ymm27, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm28, %zmm0 ; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] ; AVX512DQ-FAST-NEXT: vporq %zmm2, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpandq %ymm31, %ymm24, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-FAST-NEXT: vporq %zmm3, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpandq %ymm31, %ymm25, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-FAST-NEXT: vporq %zmm3, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,0,1,4,5,4,5] +; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,0,1,4,5,4,5] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vpandq %ymm31, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-FAST-NEXT: vporq %zmm2, %zmm1, %zmm1 ; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm2, %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-FAST-NEXT: vporq %zmm2, %zmm5, %zmm22 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm0, %zmm22 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512DQ-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512DQ-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm16 = zmm0[0,1,0,1],mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[1,1,0,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,2,0,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm18 +; AVX512DQ-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm26 = zmm0[0,1,0,1],mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[1,1,0,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,2,0,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm19 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[1,1,0,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm5, %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm14 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[1,1,0,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm13 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm12 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] ; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [4,5,4,5,5,7,4,5] -; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm19, %ymm19 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm20 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm20 -; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm21 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm9[23],zero,ymm9[23,24,25,26],zero,ymm9[24],zero,ymm9[30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,4,5,5,7,4,5] +; AVX512DQ-FAST-NEXT: vpermd %ymm15, %ymm9, %ymm20 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm24 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm24 +; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm25 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm22 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm29 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm29 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm14, %ymm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,0,1,4,5,4,5] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm23[0,1,0,1,4,5,4,5] -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm17, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm15, %ymm4 -; AVX512DQ-FAST-NEXT: vpor %ymm13, %ymm8, %ymm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm18 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm16 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm16 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm13, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm23 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm23 +; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,0,1,4,5,4,5] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm11[0,1,0,1,4,5,4,5] +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm14, %ymm1 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm20, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm1 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 384(%rax) -; AVX512DQ-FAST-NEXT: addq $1432, %rsp # imm = 0x598 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) +; AVX512DQ-FAST-NEXT: addq $1496, %rsp # imm = 0x5D8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -8537,9 +9019,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-SLOW-NEXT: vpermw %ymm13, %ymm2, %ymm2 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %ymm10 +; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %ymm11 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512BW-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX512BW-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm3 ; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %ymm12 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] ; AVX512BW-SLOW-NEXT: vpshufb %ymm4, %ymm12, %ymm5 @@ -8551,16 +9033,16 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm11 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm10 ; AVX512BW-SLOW-NEXT: movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020 ; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm11 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm10 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512BW-SLOW-NEXT: vpshufb %ymm0, %ymm17, %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %ymm18 +; AVX512BW-SLOW-NEXT: vpshufb %ymm0, %ymm18, %ymm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %ymm17 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512BW-SLOW-NEXT: vpshufb %ymm3, %ymm18, %ymm7 +; AVX512BW-SLOW-NEXT: vpshufb %ymm3, %ymm17, %ymm7 ; AVX512BW-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm5 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm29 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %xmm23 @@ -8568,31 +9050,31 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm19 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm20 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm21 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm20, %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm21 +; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm21, %ymm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm22 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512BW-SLOW-NEXT: vpshufb %ymm26, %ymm21, %ymm7 +; AVX512BW-SLOW-NEXT: vpshufb %ymm26, %ymm22, %ymm7 ; AVX512BW-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm22 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm20 ; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %xmm16 ; AVX512BW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm16[8],xmm22[8],xmm16[9],xmm22[9],xmm16[10],xmm22[10],xmm16[11],xmm22[11],xmm16[12],xmm22[12],xmm16[13],xmm22[13],xmm16[14],xmm22[14],xmm16[15],xmm22[15] +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm16[8],xmm20[8],xmm16[9],xmm20[9],xmm16[10],xmm20[10],xmm16[11],xmm20[11],xmm16[12],xmm20[12],xmm16[13],xmm20[13],xmm16[14],xmm20[14],xmm16[15],xmm20[15] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm25 = xmm25[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,1,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm25, %zmm8 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm25, %zmm7 ; AVX512BW-SLOW-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 ; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm19, %zmm8 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm19, %zmm7 {%k1} ; AVX512BW-SLOW-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 ; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm8 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] -; AVX512BW-SLOW-NEXT: vpermw %ymm9, %ymm11, %ymm11 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm7 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] +; AVX512BW-SLOW-NEXT: vpermw %ymm9, %ymm10, %ymm10 ; AVX512BW-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r9), %ymm28 ; AVX512BW-SLOW-NEXT: vpshufb %ymm2, %ymm28, %ymm2 ; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm31 @@ -8600,31 +9082,31 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm31[20],zero,ymm31[18],zero,ymm31[20,21,20,21],zero,ymm31[19],zero,ymm31[19,20,21,22],zero ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm28[20],zero,ymm28[18],zero,zero,zero,zero,ymm28[21],zero,ymm28[19],zero,zero,zero,zero,ymm28[22] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpor %ymm4, %ymm11, %ymm4 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm28[20],zero,ymm28[18],zero,zero,zero,zero,ymm28[21],zero,ymm28[19],zero,zero,zero,zero,ymm28[22] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpor %ymm4, %ymm10, %ymm4 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm4 ; AVX512BW-SLOW-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 ; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rsi), %ymm5 ; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25 -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm25[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] +; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm25[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,1,1,4,4,5,5] ; AVX512BW-SLOW-NEXT: movl $676341840, %esi # imm = 0x28502850 ; AVX512BW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm2[u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm25, %ymm11 -; AVX512BW-SLOW-NEXT: vpshufb %ymm26, %ymm2, %ymm19 -; AVX512BW-SLOW-NEXT: vporq %ymm11, %ymm19, %ymm11 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm11 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512BW-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm19 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm5[u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm25, %ymm10 +; AVX512BW-SLOW-NEXT: vpshufb %ymm26, %ymm5, %ymm19 +; AVX512BW-SLOW-NEXT: vporq %ymm10, %ymm19, %ymm10 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm10 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512BW-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm19 ; AVX512BW-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX512BW-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm3 ; AVX512BW-SLOW-NEXT: vporq %ymm19, %ymm3, %ymm3 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm19 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm19 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm24 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,3,2,3] @@ -8632,25 +9114,25 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm3, %zmm3 ; AVX512BW-SLOW-NEXT: movabsq $3485998880071096368, %rsi # imm = 0x3060C183060C1830 ; AVX512BW-SLOW-NEXT: kmovq %rsi, %k3 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm11 {%k3} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm10 {%k3} ; AVX512BW-SLOW-NEXT: movabsq $-4357498600088870461, %rsi # imm = 0xC3870E1C3870E1C3 ; AVX512BW-SLOW-NEXT: kmovq %rsi, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm4, %zmm11 {%k2} -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm20[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm4, %zmm10 {%k2} +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm21[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] ; AVX512BW-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm6 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] ; AVX512BW-SLOW-NEXT: movl $338170920, %esi # imm = 0x14281428 ; AVX512BW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm21, %ymm3 {%k2} +; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm22, %ymm3 {%k2} ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm26 -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm17[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm18[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,3,3,4,6,7,7] -; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512BW-SLOW-NEXT: vpshufb %ymm7, %ymm18, %ymm4 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm17, %ymm4 ; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm4[2,3,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 @@ -8660,7 +9142,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm19, %zmm19 ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm26, %zmm19 {%k3} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm26 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm10[27],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm26 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm11[27],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm27 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,3,2,3] @@ -8682,24 +9164,24 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm19 {%k3} ; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm25[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] -; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm0 {%k2} +; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm0 {%k2} ; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] ; AVX512BW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] ; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm25, %ymm25 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,3,2,3] -; AVX512BW-SLOW-NEXT: vporq %ymm2, %ymm25, %ymm2 +; AVX512BW-SLOW-NEXT: vporq %ymm5, %ymm25, %ymm5 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm25 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm25 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[24,25],zero,ymm2[23],zero,ymm2[21,22,23,26],zero,ymm2[24],zero,ymm2[28,29,26,27] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512BW-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpor %ymm0, %ymm5, %ymm0 +; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,3,3,4,6,7,7] ; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] @@ -8724,20 +9206,20 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 ; AVX512BW-SLOW-NEXT: kmovq %rax, %k3 ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm25 {%k3} -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm20[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm21[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm21[u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] -; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm21, %ymm5 -; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm20, %ymm6 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm22[u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm22, %ymm5 +; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm21, %ymm6 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm5, %zmm5 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm5, %zmm5 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm6 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm6, %zmm6 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm5[18,19,20,21],zero,zmm5[19],zero,zmm5[25,26,27,22],zero,zmm5[20],zero,zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm5[55],zero,zmm5[53,54,55,58],zero,zmm5[56],zero,zmm5[60,61,58,59] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,3,2,3,6,7,6,7] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm6[18],zero,zero,zero,zero,zmm6[21],zero,zmm6[19],zero,zero,zero,zero,zmm6[22],zero,zmm6[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm6[57],zero,zmm6[55],zero,zero,zero,zero,zmm6[58],zero,zmm6[56],zero,zero,zero,zero @@ -8747,7 +9229,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm1 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm1[18],zero,zmm1[20,21,20,21],zero,zmm1[19],zero,zmm1[19,20,21,22],zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm1[55],zero,zmm1[55,56,57,58],zero,zmm1[56],zero,zmm1[62,63] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm0[20],zero,zmm0[18],zero,zero,zero,zero,zmm0[21],zero,zmm0[19],zero,zero,zero,zero,zmm0[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm0[57],zero,zmm0[55],zero,zero,zero,zero,zmm0[58],zero,zmm0[56],zero,zero ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] ; AVX512BW-SLOW-NEXT: vporq %zmm1, %zmm0, %zmm1 @@ -8760,82 +9242,82 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C ; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm17 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm1 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = ; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm6 -; AVX512BW-SLOW-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX512BW-SLOW-NEXT: vpor %xmm1, %xmm6, %xmm1 ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm7 -; AVX512BW-SLOW-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5] -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm4 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm26, %xmm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] -; AVX512BW-SLOW-NEXT: vpermi2w %zmm30, %zmm9, %zmm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm8 +; AVX512BW-SLOW-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm1[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm1 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm1 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm26, %xmm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] +; AVX512BW-SLOW-NEXT: vpermi2w %zmm30, %zmm9, %zmm11 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> ; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm27, %xmm12 -; AVX512BW-SLOW-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX512BW-SLOW-NEXT: vpor %xmm8, %xmm12, %xmm8 ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm7, %zmm7 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm8, %zmm8 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] ; AVX512BW-SLOW-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 ; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm7 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm8 {%k1} ; AVX512BW-SLOW-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 ; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm23, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm8, %zmm1 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm2 ; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm29, %xmm5 -; AVX512BW-SLOW-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX512BW-SLOW-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm29[0],xmm23[0],xmm29[1],xmm23[1],xmm29[2],xmm23[2],xmm29[3],xmm23[3],xmm29[4],xmm23[4],xmm29[5],xmm23[5],xmm29[6],xmm23[6],xmm29[7],xmm23[7] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm5, %zmm1 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm5, %zmm2 ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm22, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm20, %xmm5 ; AVX512BW-SLOW-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm22[0],xmm16[0],xmm22[1],xmm16[1],xmm22[2],xmm16[2],xmm22[3],xmm16[3],xmm22[4],xmm16[4],xmm22[5],xmm16[5],xmm22[6],xmm16[6],xmm22[7],xmm16[7] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm20[0],xmm16[0],xmm20[1],xmm16[1],xmm20[2],xmm16[2],xmm20[3],xmm16[3],xmm20[4],xmm16[4],xmm20[5],xmm16[5],xmm20[6],xmm16[6],xmm20[7],xmm16[7] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm3 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,1,0,1,4,5,4,5] ; AVX512BW-SLOW-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C ; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm3 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm2 ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm6, %xmm2 -; AVX512BW-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm2, %zmm0 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm6, %xmm4 +; AVX512BW-SLOW-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm4, %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm4, %zmm0 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5] ; AVX512BW-SLOW-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 ; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} ; AVX512BW-SLOW-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 ; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm3 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, 320(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 256(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, 384(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm19, 192(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -8845,24 +9327,24 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: subq $200, %rsp ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm5 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa (%rax), %ymm6 -; AVX512BW-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa 32(%rax), %ymm12 +; AVX512BW-FAST-NEXT: vmovdqa (%rax), %ymm4 +; AVX512BW-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa 32(%rax), %ymm13 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm1 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpermw %ymm6, %ymm4, %ymm4 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm14 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm18, %ymm14, %ymm7 +; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm1 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm3, %ymm3 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm15 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm15, %ymm7 ; AVX512BW-FAST-NEXT: vmovdqa (%r8), %ymm1 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] ; AVX512BW-FAST-NEXT: vpshufb %ymm20, %ymm1, %ymm8 @@ -8879,89 +9361,89 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm22 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm1, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa %ymm1, %ymm6 +; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm1, %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa %ymm1, %ymm7 ; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm1, %ymm11 -; AVX512BW-FAST-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512BW-FAST-NEXT: vmovdqa %ymm1, %ymm8 ; AVX512BW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FAST-NEXT: vpor %ymm8, %ymm11, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm15 +; AVX512BW-FAST-NEXT: vpor %ymm6, %ymm11, %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm14 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm16 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm16[8],xmm14[8],xmm16[9],xmm14[9],xmm16[10],xmm14[10],xmm16[11],xmm14[11],xmm16[12],xmm14[12],xmm16[13],xmm14[13],xmm16[14],xmm14[14],xmm16[15],xmm14[15] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm26 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm26 ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm28, %ymm11, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512BW-FAST-NEXT: vpshufb %ymm28, %ymm11, %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm12 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512BW-FAST-NEXT: vpshufb %ymm29, %ymm13, %ymm17 -; AVX512BW-FAST-NEXT: vporq %ymm8, %ymm17, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm17 +; AVX512BW-FAST-NEXT: vpshufb %ymm29, %ymm12, %ymm18 +; AVX512BW-FAST-NEXT: vporq %ymm6, %ymm18, %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm18 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm19 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm27 = xmm27[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm27, %zmm8 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm27, %zmm6 ; AVX512BW-FAST-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm26, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm26, %zmm6 {%k1} ; AVX512BW-FAST-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm6 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] -; AVX512BW-FAST-NEXT: vpermw %ymm12, %ymm22, %ymm22 -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm0 +; AVX512BW-FAST-NEXT: vpermw %ymm13, %ymm22, %ymm22 +; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm22 -; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512BW-FAST-NEXT: vpshufb %ymm18, %ymm1, %ymm18 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %ymm27 -; AVX512BW-FAST-NEXT: vpshufb %ymm20, %ymm27, %ymm20 -; AVX512BW-FAST-NEXT: vporq %ymm18, %ymm20, %ymm18 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm27[20],zero,ymm27[18],zero,ymm27[20,21,20,21],zero,ymm27[19],zero,ymm27[19,20,21,22],zero +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %ymm27 +; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm27, %ymm17 +; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512BW-FAST-NEXT: vpshufb %ymm20, %ymm1, %ymm20 +; AVX512BW-FAST-NEXT: vporq %ymm17, %ymm20, %ymm17 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm26 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[20],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm26 = ymm27[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm27[20],zero,ymm27[18],zero,zero,zero,zero,ymm27[21],zero,ymm27[19],zero,zero,zero,zero,ymm27[22] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,3,2,3] ; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm26, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm18, %zmm26 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm26 ; AVX512BW-FAST-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm26 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %ymm22 -; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm22, %ymm18 +; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm22, %ymm17 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %ymm30 ; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm30, %ymm20 -; AVX512BW-FAST-NEXT: vporq %ymm18, %ymm20, %ymm18 +; AVX512BW-FAST-NEXT: vporq %ymm17, %ymm20, %ymm17 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm22[18],zero,ymm22[18,19,20,21],zero,ymm22[19],zero,ymm22[25,26,27,22],zero,ymm22[20],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm21 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm30[18],zero,zero,zero,zero,ymm30[21],zero,ymm30[19],zero,zero,zero,zero,ymm30[22],zero,ymm30[20] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,3,2,3] ; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm21, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm18, %zmm21 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm21 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %ymm31 -; AVX512BW-FAST-NEXT: vpshufb %ymm28, %ymm31, %ymm18 +; AVX512BW-FAST-NEXT: vpshufb %ymm28, %ymm31, %ymm17 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 ; AVX512BW-FAST-NEXT: vpshufb %ymm29, %ymm0, %ymm20 -; AVX512BW-FAST-NEXT: vporq %ymm18, %ymm20, %ymm18 +; AVX512BW-FAST-NEXT: vporq %ymm17, %ymm20, %ymm17 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm31[18,19,20,21],zero,ymm31[19],zero,ymm31[21,20,21,22],zero,ymm31[20],zero,ymm31[22,23] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm23 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] ; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm23, %ymm20 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm23 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm18, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm9 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm17 +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FAST-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 ; AVX512BW-FAST-NEXT: kmovq %r10, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm21, %zmm18 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm21, %zmm17 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm21 ; AVX512BW-FAST-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm26, %zmm18 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm26, %zmm17 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 (%rax), %zmm26 ; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm23[4,5,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm22 = zmm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm22[23],zero,zmm22[21,22,23,26],zero,zmm22[24],zero,zmm22[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm22[62],zero,zmm22[60],zero,zero,zero,zero,zmm22[63],zero,zmm22[61],zero @@ -8971,10 +9453,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm28 = zmm28[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vporq %zmm22, %zmm28, %zmm29 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm28 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm9[4,5,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm0[23],zero,zero,zero,zero,zmm0[26],zero,zmm0[24],zero,zero,zero,zero,zmm0[27],zero,zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm0[60],zero,zmm0[62,63,62,63],zero,zmm0[61],zero,zmm0[63,60,61] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm31[0,1,2,3],zmm3[4,5,6,7] +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm31[0,1,2,3],zmm5[4,5,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm22 = zmm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm22[23],zero,zero,zero,zero,zmm22[26],zero,zmm22[24],zero,zero,zero,zero,zmm22[27],zero,zmm22[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm22[62],zero,zmm22[60],zero,zero,zero,zero,zmm22[63],zero,zmm22[61],zero,zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm22, %zmm22 @@ -8982,10 +9464,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm29, %zmm22 {%k1} -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm27[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm21[4,5,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm0[23],zero,zmm0[23,24,25,26],zero,zmm0[24],zero,zmm0[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm0[59],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm27[0,1,2,3],zmm3[4,5,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[25],zero,zmm1[23],zero,zero,zero,zero,zmm1[26],zero,zmm1[24],zero,zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero,zmm1[61] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm1, %zmm0 @@ -8993,44 +9475,44 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm22 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] -; AVX512BW-FAST-NEXT: vpermi2w %zmm26, %zmm12, %zmm0 +; AVX512BW-FAST-NEXT: vpermi2w %zmm26, %zmm13, %zmm0 ; AVX512BW-FAST-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm22 {%k3} ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm11[28],zero,ymm11[30,31,30,31],zero,ymm11[29],zero,ymm11[31,28,29] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero,zero,zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29],zero,zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm28[0],xmm30[0],xmm28[1],xmm30[1],xmm28[2],xmm30[2],xmm28[3],xmm30[3],xmm28[4],xmm30[4],xmm28[5],xmm30[5],xmm28[6],xmm30[6],xmm28[7],xmm30[7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm27 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero -; AVX512BW-FAST-NEXT: vmovdqa64 %ymm6, %ymm20 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm27 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero,zero +; AVX512BW-FAST-NEXT: vmovdqa64 %ymm7, %ymm20 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,3,2,3] ; AVX512BW-FAST-NEXT: vporq %ymm0, %ymm27, %ymm27 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm31 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm31[0],xmm1[0],xmm31[1],xmm1[1],xmm31[2],xmm1[2],xmm31[3],xmm1[3],xmm31[4],xmm1[4],xmm31[5],xmm1[5],xmm31[6],xmm1[6],xmm31[7],xmm1[7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512BW-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm27, %zmm27 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm27 {%k2} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm14[27],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm27 {%k2} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm15[27],zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm24[27],zero,zero,zero,zero,ymm24[30],zero,ymm24[28],zero,zero,zero,zero,ymm24[31],zero,ymm24[29] -; AVX512BW-FAST-NEXT: vmovdqa64 %ymm24, %ymm7 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm24[27],zero,zero,zero,zero,ymm24[30],zero,ymm24[28],zero,zero,zero,zero,ymm24[31],zero,ymm24[29] +; AVX512BW-FAST-NEXT: vmovdqa64 %ymm24, %ymm9 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm3, %ymm2 ; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %xmm4 ; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %xmm3 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] @@ -9069,7 +9551,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm28, %zmm3 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] -; AVX512BW-FAST-NEXT: vpermi2w %zmm26, %zmm12, %zmm4 +; AVX512BW-FAST-NEXT: vpermi2w %zmm26, %zmm13, %zmm4 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,1,0,1,4,5,4,5] ; AVX512BW-FAST-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 @@ -9078,47 +9560,47 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm24 {%k2} ; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm16, %xmm3 -; AVX512BW-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX512BW-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3],xmm14[4],xmm16[4],xmm14[5],xmm16[5],xmm14[6],xmm16[6],xmm14[7],xmm16[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm0 ; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm19, %xmm2 -; AVX512BW-FAST-NEXT: vpshufb %xmm31, %xmm17, %xmm3 +; AVX512BW-FAST-NEXT: vpshufb %xmm31, %xmm18, %xmm3 ; AVX512BW-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm17[0],xmm19[0],xmm17[1],xmm19[1],xmm17[2],xmm19[2],xmm17[3],xmm19[3],xmm17[4],xmm19[4],xmm17[5],xmm19[5],xmm17[6],xmm19[6],xmm17[7],xmm19[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm18[0],xmm19[0],xmm18[1],xmm19[1],xmm18[2],xmm19[2],xmm18[3],xmm19[3],xmm18[4],xmm19[4],xmm18[5],xmm19[5],xmm18[6],xmm19[6],xmm18[7],xmm19[7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm0[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm2[0,1,0,1,4,5,4,5] ; AVX512BW-FAST-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm2 {%k2} -; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm25, %xmm0 -; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm10, %xmm1 -; AVX512BW-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm25[0],xmm10[1],xmm25[1],xmm10[2],xmm25[2],xmm10[3],xmm25[3],xmm10[4],xmm25[4],xmm10[5],xmm25[5],xmm10[6],xmm25[6],xmm10[7],xmm25[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm1 # 32-byte Folded Reload +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm0 {%k2} +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm25, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm10, %xmm2 +; AVX512BW-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm25[0],xmm10[1],xmm25[1],xmm10[2],xmm25[2],xmm10[3],xmm25[3],xmm10[4],xmm25[4],xmm10[5],xmm25[5],xmm10[6],xmm25[6],xmm10[7],xmm25[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm2 # 32-byte Folded Reload ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-FAST-NEXT: vpermw %zmm2, %zmm3, %zmm3 ; AVX512BW-FAST-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm0 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2} ; AVX512BW-FAST-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm2 {%k2} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm0[19],zero,zmm0[21,20,21,22],zero,zmm0[20],zero,zmm0[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm0[55],zero,zero,zero,zero,zmm0[58],zero,zmm0[56],zero,zero,zero,zero,zmm0[59],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm1[19],zero,zmm1[21,20,21,22],zero,zmm1[20],zero,zmm1[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero,zero,zero,zmm1[59],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm3[21],zero,zmm3[19],zero,zero,zero,zero,zmm3[22],zero,zmm3[20],zero,zero,zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm3[55],zero,zero,zero,zero,zmm3[58],zero,zmm3[56],zero,zero,zero,zero,zmm3[59],zero,zmm3[57] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512BW-FAST-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm3, %zmm3 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm4 # 32-byte Folded Reload @@ -9127,30 +9609,30 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22],zero,zmm4[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero,zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm21, %zmm4 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm0[18],zero,zmm0[20,21,20,21],zero,zmm0[19],zero,zmm0[19,20,21,22],zero,zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm0[55],zero,zmm0[55,56,57,58],zero,zmm0[56],zero,zmm0[62,63] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm4 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm1[18],zero,zmm1[20,21,20,21],zero,zmm1[19],zero,zmm1[19,20,21,22],zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm1[55],zero,zmm1[55,56,57,58],zero,zmm1[56],zero,zmm1[62,63] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[20],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm4, %zmm0 +; AVX512BW-FAST-NEXT: vporq %zmm1, %zmm4, %zmm1 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm4, %zmm1 +; AVX512BW-FAST-NEXT: vpermw %zmm2, %zmm4, %zmm2 ; AVX512BW-FAST-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} ; AVX512BW-FAST-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm3 {%k1} ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) ; AVX512BW-FAST-NEXT: addq $200, %rsp ; AVX512BW-FAST-NEXT: vzeroupper @@ -9180,10 +9662,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} -; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} ; AVX512DQBW-SLOW: {{.*}} -; AVX512F-ONLY-SLOW: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} ; FALLBACK10: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index cfb971dd16217..f8a3ab852b092 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -1177,16 +1177,17 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride8_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $216, %rsp +; SSE-NEXT: subq $232, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rsi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rcx), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rcx), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%r8), %xmm13 ; SSE-NEXT: movdqa (%r9), %xmm12 ; SSE-NEXT: movdqa (%r10), %xmm14 @@ -1197,66 +1198,64 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm2, %xmm15 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 ; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm5, %xmm10 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm10, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm15[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,1,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm10 ; SSE-NEXT: pandn %xmm6, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,0,0] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm6 ; SSE-NEXT: por %xmm10, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm15[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: por %xmm10, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm14[0,2,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[2,1,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm6, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload @@ -1265,139 +1264,142 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,2,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[8],mem[8],xmm9[9],mem[9],xmm9[10],mem[10],xmm9[11],mem[11],xmm9[12],mem[12],xmm9[13],mem[13],xmm9[14],mem[14],xmm9[15],mem[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[8],mem[8],xmm11[9],mem[9],xmm11[10],mem[10],xmm11[11],mem[11],xmm11[12],mem[12],xmm11[13],mem[13],xmm11[14],mem[14],xmm11[15],mem[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa 16(%r10), %xmm11 +; SSE-NEXT: movdqa 16(%r10), %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm10 -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa 16(%rax), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: movdqa 16(%r8), %xmm13 -; SSE-NEXT: movdqa 16(%r9), %xmm15 -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 16(%r8), %xmm12 +; SSE-NEXT: movdqa 16(%r9), %xmm11 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movdqa 16(%rdx), %xmm12 -; SSE-NEXT: movdqa 16(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa 16(%rdx), %xmm13 +; SSE-NEXT: movdqa 16(%rcx), %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rsi), %xmm5 +; SSE-NEXT: movdqa 16(%rsi), %xmm6 ; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,1,1] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[1,1,1,1] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm8, %xmm15 +; SSE-NEXT: por %xmm2, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm9, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: por %xmm15, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[8],mem[8],xmm10[9],mem[9],xmm10[10],mem[10],xmm10[11],mem[11],xmm10[12],mem[12],xmm10[13],mem[13],xmm10[14],mem[14],xmm10[15],mem[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm11[8],xmm15[9],xmm11[9],xmm15[10],xmm11[10],xmm15[11],xmm11[11],xmm15[12],xmm11[12],xmm15[13],xmm11[13],xmm15[14],xmm11[14],xmm15[15],xmm11[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[8],mem[8],xmm12[9],mem[9],xmm12[10],mem[10],xmm12[11],mem[11],xmm12[12],mem[12],xmm12[13],mem[13],xmm12[14],mem[14],xmm12[15],mem[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm7[8],xmm13[9],xmm7[9],xmm13[10],xmm7[10],xmm13[11],xmm7[11],xmm13[12],xmm7[12],xmm13[13],xmm7[13],xmm13[14],xmm7[14],xmm13[15],xmm7[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -1406,42 +1408,42 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -1449,133 +1451,133 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm5 ; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[3,3,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm11, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: pandn %xmm10, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm8, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: por %xmm10, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; SSE-NEXT: por %xmm12, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: por %xmm11, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm6, 224(%rax) -; SSE-NEXT: movdqa %xmm5, 240(%rax) +; SSE-NEXT: movdqa %xmm10, 240(%rax) ; SSE-NEXT: movdqa %xmm4, 160(%rax) -; SSE-NEXT: movdqa %xmm9, 176(%rax) +; SSE-NEXT: movdqa %xmm2, 176(%rax) ; SSE-NEXT: movdqa %xmm0, 96(%rax) ; SSE-NEXT: movdqa %xmm3, 112(%rax) ; SSE-NEXT: movdqa %xmm1, 32(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rax) @@ -1593,7 +1595,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: addq $216, %rsp +; SSE-NEXT: addq $232, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride8_vf32: @@ -1601,10 +1603,9 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: subq $72, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -1612,8 +1613,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm9 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 @@ -1622,237 +1623,236 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm11 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm15[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm2, %ymm14 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm2, %ymm13 ; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm2, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4],ymm5[5],ymm13[6],ymm5[7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm2, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm13 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm12 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm2, %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm14[8],mem[8],xmm14[9],mem[9],xmm14[10],mem[10],xmm14[11],mem[11],xmm14[12],mem[12],xmm14[13],mem[13],xmm14[14],mem[14],xmm14[15],mem[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm8 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4],ymm5[5],ymm1[6],ymm5[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4],ymm5[5],ymm1[6],ymm5[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm5[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4],ymm5[5],ymm1[6],ymm5[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload @@ -1889,151 +1889,151 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[1,1,1,1] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm9 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm14[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm15[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm8, %ymm14 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm8, %ymm15 ; AVX2-SLOW-NEXT: vmovaps 16(%r10), %xmm8 ; AVX2-SLOW-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7,8],ymm14[9],ymm10[10,11,12],ymm14[13],ymm10[14,15] -; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2],ymm0[3],ymm14[4],ymm0[5],ymm14[6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7,8],ymm15[9],ymm9[10,11,12],ymm15[13],ymm9[14,15] +; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3],ymm15[4],ymm0[5],ymm15[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 16(%r9), %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3],ymm12[4,5,6],ymm1[7],ymm12[8,9,10],ymm1[11],ymm12[12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm15[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm13, %ymm13 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm13 ; AVX2-SLOW-NEXT: vmovdqa 16(%r8), %xmm15 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7,8],ymm13[9],ymm12[10,11,12],ymm13[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm1[1],ymm12[2],ymm1[3],ymm12[4],ymm1[5],ymm12[6],ymm1[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7,8],ymm13[9],ymm1[10,11,12],ymm13[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm7 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7,8],ymm9[9],ymm7[10,11,12],ymm9[13],ymm7[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovdqa %xmm10, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm10 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm9 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0,1,2],ymm7[3],ymm12[4,5,6],ymm7[7],ymm12[8,9,10],ymm7[11],ymm12[12,13,14],ymm7[15] -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm13, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm4 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0,1,2],ymm7[3],ymm13[4,5,6],ymm7[7],ymm13[8,9,10],ymm7[11],ymm13[12,13,14],ymm7[15] +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm14[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7,8],ymm0[9],ymm3[10,11,12],ymm0[13],ymm3[14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm14[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm12[0,1,2,3,6,5,7,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm15, %ymm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,1,3,3,4,5,6,7] @@ -2041,13 +2041,13 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,1,1] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] @@ -2098,134 +2098,134 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: subq $72, %rsp ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm6 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm7 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm8 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm9 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm10 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm5 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm5 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm6 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm13 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm15 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm14 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1,2],ymm3[3],ymm10[4,5,6],ymm3[7],ymm10[8,9,10],ymm3[11],ymm10[12,13,14],ymm3[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7,8],ymm0[9],ymm10[10,11,12],ymm0[13],ymm10[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 16(%r10), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm0 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; AVX2-FAST-NEXT: vmovdqa 16(%r10), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm6 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm12 ; AVX2-FAST-NEXT: vmovdqa 16(%r9), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0,1,2],ymm3[3],ymm6[4,5,6],ymm3[7],ymm6[8,9,10],ymm3[11],ymm6[12,13,14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm5 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm10 -; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm14 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm2 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9 +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm14 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm10[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm9[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7,8],ymm13[9],ymm14[10,11,12],ymm13[13],ymm14[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4],ymm15[5],ymm13[6],ymm15[7] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7,8],ymm6[9],ymm3[10,11,12],ymm6[13],ymm3[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm4 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm3 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7],ymm8[8,9,10],ymm7[11],ymm8[12,13,14],ymm7[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm5 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm4 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm5 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm11, %xmm5 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] @@ -2251,134 +2251,134 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: subq $72, %rsp ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm9, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1,2],ymm3[3],ymm10[4,5,6],ymm3[7],ymm10[8,9,10],ymm3[11],ymm10[12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm9, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7,8],ymm0[9],ymm10[10,11,12],ymm0[13],ymm10[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r10), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r10), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r9), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0,1,2],ymm3[3],ymm6[4,5,6],ymm3[7],ymm6[8,9,10],ymm3[11],ymm6[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm10[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm9[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7,8],ymm13[9],ymm14[10,11,12],ymm13[13],ymm14[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4],ymm15[5],ymm13[6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7,8],ymm6[9],ymm3[10,11,12],ymm6[13],ymm3[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7],ymm8[8,9,10],ymm7[11],ymm8[12,13,14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm9, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm11, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] @@ -2408,137 +2408,137 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rax), %xmm12 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm23 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r9), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r8), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7],ymm5[8,9,10],ymm1[11],ymm5[12,13,14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7],ymm6[8,9,10],ymm0[11],ymm6[12,13,14],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm26 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm10[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm15, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,3,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,0,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rcx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm15, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rcx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3],ymm10[4,5,6],ymm2[7],ymm10[8,9,10],ymm2[11],ymm10[12,13,14],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm10, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7],ymm7[8,9,10],ymm1[11],ymm7[12,13,14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3],ymm10[4,5,6],ymm1[7],ymm10[8,9,10],ymm1[11],ymm10[12,13,14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm2, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm19 = xmm0[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm20 = xmm0[3,3,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm19, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm21[2,1,3,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm19, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7,8],ymm15[9],ymm1[10,11,12],ymm15[13],ymm1[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,0,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm11, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 @@ -2556,14 +2556,14 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm24, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm24, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7] @@ -2581,8 +2581,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] @@ -2609,12 +2609,12 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm16, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm17, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm17, %zmm15 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq @@ -2633,8 +2633,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm3 @@ -2647,19 +2647,19 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm10 ; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm28 @@ -2673,63 +2673,63 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r10), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rax), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r9), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[0,1,2,3,8,9,u,u,8,9,10,11,10,11,u,u,16,17,18,19,28,29,u,u,28,29,26,27,30,31,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3],ymm15[4,5,6],ymm0[7],ymm15[8,9,10],ymm0[11],ymm15[12,13,14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,0,1,u,u,8,9,10,11,2,3,u,u,20,21,18,19,20,21,u,u,24,25,26,27,22,23,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm14[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm8[1],ymm15[2,3,4],ymm8[5],ymm15[6,7,8],ymm8[9],ymm15[10,11,12],ymm8[13],ymm15[14,15] -; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r10), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rax), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r9), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r8), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[0,1,2,3,8,9,u,u,8,9,10,11,10,11,u,u,16,17,18,19,28,29,u,u,28,29,26,27,30,31,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,0,1,u,u,8,9,10,11,2,3,u,u,20,21,18,19,20,21,u,u,24,25,26,27,22,23,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm14, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rcx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm31 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm14, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7,8],ymm14[9],ymm0[10,11,12],ymm14[13],ymm0[14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm7 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm9 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 @@ -2742,13 +2742,13 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 @@ -2756,11 +2756,11 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm6 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm6 @@ -2771,7 +2771,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm25, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm23, %zmm19 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm17, %zmm18 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm20, %zmm1 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} @@ -3017,21 +3017,21 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> ; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm5 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, %xmm10 ; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqa (%r10), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %xmm10 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] +; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %xmm11 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] @@ -3055,69 +3055,70 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: movw $-21846, %r11w # imm = 0xAAAA ; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm17 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rcx), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdx), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rsi), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm30 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdx), %xmm6 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm10, %xmm29 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%r10), %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rax), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%r9), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%r8), %xmm2 ; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm29 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm14, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7,8],ymm14[9],ymm0[10,11,12],ymm14[13],ymm0[14,15] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%r10), %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rax), %xmm15 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%r9), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[0,1,2,3,8,9,u,u,8,9,10,11,10,11,u,u,16,17,18,19,28,29,u,u,28,29,26,27,30,31,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3],ymm8[4,5,6],ymm1[7],ymm8[8,9,10],ymm1[11],ymm8[12,13,14],ymm1[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[0,1,2,3,8,9,u,u,8,9,10,11,10,11,u,u,16,17,18,19,28,29,u,u,28,29,26,27,30,31,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3],ymm14[4,5,6],ymm1[7],ymm14[8,9,10],ymm1[11],ymm14[12,13,14],ymm1[15] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[0,1,2,3,0,1,u,u,8,9,10,11,2,3,u,u,20,21,18,19,20,21,u,u,24,25,26,27,22,23,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3],ymm8[4,5,6],ymm0[7],ymm8[8,9,10],ymm0[11],ymm8[12,13,14],ymm0[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[0,1,2,3,0,1,u,u,8,9,10,11,2,3,u,u,20,21,18,19,20,21,u,u,24,25,26,27,22,23,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6],ymm0[7],ymm14[8,9,10],ymm0[11],ymm14[12,13,14],ymm0[15] ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7,8],ymm7[9],ymm6[10,11,12],ymm7[13],ymm6[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm15 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm5 ; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm4 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm1, %zmm0 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 @@ -3127,26 +3128,26 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 ; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm4 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm4 ; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] ; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm4 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm5 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm4, %zmm2, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -3564,25 +3565,25 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] ; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa 16(%r9), %xmm9 +; SSE-NEXT: movdqa 16(%r9), %xmm6 ; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[0,0,2,1] ; SSE-NEXT: pand %xmm13, %xmm14 ; SSE-NEXT: por %xmm4, %xmm14 ; SSE-NEXT: movdqa 16(%rdx), %xmm3 -; SSE-NEXT: movdqa 16(%rcx), %xmm6 +; SSE-NEXT: movdqa 16(%rcx), %xmm8 ; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm4, %xmm15 ; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa 16(%rsi), %xmm8 +; SSE-NEXT: movdqa 16(%rsi), %xmm9 ; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm7 @@ -3657,24 +3658,24 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] ; SSE-NEXT: movdqa %xmm13, %xmm10 ; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm7 -; SSE-NEXT: por %xmm10, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,3] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm6, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,0,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm9, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] @@ -3737,34 +3738,34 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%r10), %xmm0 -; SSE-NEXT: movdqa 32(%rax), %xmm9 +; SSE-NEXT: movdqa 32(%rax), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] ; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: movdqa 32(%r8), %xmm3 -; SSE-NEXT: movdqa 32(%r9), %xmm8 +; SSE-NEXT: movdqa 32(%r9), %xmm7 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,0,2,1] ; SSE-NEXT: pand %xmm13, %xmm14 ; SSE-NEXT: por %xmm4, %xmm14 ; SSE-NEXT: movdqa 32(%rdx), %xmm4 -; SSE-NEXT: movdqa 32(%rcx), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rcx), %xmm8 ; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm12[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm5, %xmm15 ; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: movdqa 32(%rsi), %xmm7 +; SSE-NEXT: movdqa 32(%rsi), %xmm9 ; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm6 @@ -3833,27 +3834,27 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm9 -; SSE-NEXT: pandn %xmm6, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] ; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: por %xmm9, %xmm6 -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] @@ -3925,28 +3926,28 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa 48(%r8), %xmm7 -; SSE-NEXT: movdqa 48(%r9), %xmm12 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa 48(%r8), %xmm8 +; SSE-NEXT: movdqa 48(%r9), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm13, %xmm10 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: movdqa 48(%rdx), %xmm8 -; SSE-NEXT: movdqa 48(%rcx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7] +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: movdqa 48(%rdx), %xmm7 +; SSE-NEXT: movdqa 48(%rcx), %xmm12 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: movdqa 48(%rdi), %xmm5 ; SSE-NEXT: movdqa 48(%rsi), %xmm11 -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] @@ -3960,11 +3961,11 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] ; SSE-NEXT: movdqa %xmm13, %xmm14 ; SSE-NEXT: pandn %xmm10, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm4[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] ; SSE-NEXT: pand %xmm13, %xmm10 ; SSE-NEXT: por %xmm14, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm14, %xmm15 @@ -3979,11 +3980,11 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] ; SSE-NEXT: movdqa %xmm13, %xmm14 ; SSE-NEXT: pandn %xmm10, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] ; SSE-NEXT: pand %xmm13, %xmm10 ; SSE-NEXT: por %xmm14, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,3,3] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm14, %xmm2 @@ -3998,18 +3999,18 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: movdqa %xmm13, %xmm6 ; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] @@ -4019,85 +4020,85 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,1,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,3,2,3] +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] ; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm11, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: pand %xmm13, %xmm8 ; SSE-NEXT: pandn %xmm2, %xmm13 -; SSE-NEXT: por %xmm7, %xmm13 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: por %xmm8, %xmm13 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm1, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm1, 496(%rax) -; SSE-NEXT: movdqa %xmm5, 480(%rax) +; SSE-NEXT: movdqa %xmm3, 480(%rax) ; SSE-NEXT: movdqa %xmm0, 464(%rax) -; SSE-NEXT: movdqa %xmm3, 448(%rax) +; SSE-NEXT: movdqa %xmm4, 448(%rax) ; SSE-NEXT: movdqa %xmm6, 432(%rax) ; SSE-NEXT: movdqa %xmm10, 416(%rax) ; SSE-NEXT: movdqa %xmm15, 400(%rax) @@ -4168,357 +4169,357 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm5 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm3 +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm13 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm4[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm4 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm4, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm5, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4],ymm0[5],ymm7[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%r10), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm14 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2],ymm13[3],ymm1[4],ymm13[5],ymm1[6],ymm13[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm4, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm5, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm14 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2],ymm13[3],ymm1[4],ymm13[5],ymm1[6],ymm13[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm5, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm9 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm15 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm4, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0],ymm13[1],ymm2[2],ymm13[3],ymm2[4],ymm13[5],ymm2[6],ymm13[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm5, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4],ymm15[5],ymm3[6],ymm15[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] @@ -4529,131 +4530,131 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3],xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm4, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2],ymm2[3],ymm6[4],ymm2[5],ymm6[6],ymm2[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[8],mem[8],xmm9[9],mem[9],xmm9[10],mem[10],xmm9[11],mem[11],xmm9[12],mem[12],xmm9[13],mem[13],xmm9[14],mem[14],xmm9[15],mem[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm4, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm5, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2],ymm2[3],ymm7[4],ymm2[5],ymm7[6],ymm2[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4759,41 +4760,41 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] ; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm14 -; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm13, %ymm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7,8],ymm12[9],ymm14[10,11,12],ymm12[13],ymm14[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7,8],ymm14[9],ymm12[10,11,12],ymm14[13],ymm12[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm11, %ymm9 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] @@ -4810,12 +4811,12 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] @@ -4832,64 +4833,64 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm5 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm1 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7,8],ymm14[9],ymm12[10,11,12],ymm14[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm11, %ymm9 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,4,6,5] @@ -4899,148 +4900,148 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5,6],ymm5[7],ymm2[8,9,10],ymm5[11],ymm2[12,13,14],ymm5[15] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7],ymm0[8,9,10],ymm1[11],ymm0[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 16(%r10), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm3 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 16(%r9), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 16(%r8), %xmm10 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vmovdqa 16(%r10), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm12 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 16(%r9), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 16(%r8), %xmm9 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm3 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4],ymm9[5],ymm1[6],ymm9[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7,8],ymm14[9],ymm2[10,11,12],ymm14[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0],ymm11[1],ymm2[2],ymm11[3],ymm2[4],ymm11[5],ymm2[6],ymm11[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3],ymm8[4,5,6],ymm1[7],ymm8[8,9,10],ymm1[11],ymm8[12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3,4],ymm0[5],ymm8[6,7,8],ymm0[9],ymm8[10,11,12],ymm0[13],ymm8[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4],ymm1[5],ymm5[6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] @@ -5066,12 +5067,12 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7,8],ymm10[9],ymm6[10,11,12],ymm10[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3,4],ymm9[5],ymm6[6,7,8],ymm9[9],ymm6[10,11,12],ymm9[13],ymm6[14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] @@ -5096,10 +5097,10 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, 224(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 192(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 192(%rax) ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5143,281 +5144,281 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm10 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm12 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm13 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 48(%r10), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 48(%rax), %xmm5 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vmovdqa 48(%r9), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 48(%r8), %xmm13 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm14 -; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] +; AVX2-FAST-NEXT: vmovdqa 48(%r10), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 48(%rax), %xmm3 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 48(%r9), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 48(%r8), %xmm5 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm2 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7,8],ymm12[9],ymm1[10,11,12],ymm12[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4],ymm11[5],ymm1[6],ymm11[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm6 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm5 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm11 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm15[1],ymm11[2,3,4],ymm15[5],ymm11[6,7,8],ymm15[9],ymm11[10,11,12],ymm15[13],ymm11[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm10 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm13 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7,8],ymm15[9],ymm10[10,11,12],ymm15[13],ymm10[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm7 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2],ymm4[3],ymm7[4],ymm4[5],ymm7[6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm2 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7,8],ymm6[9],ymm1[10,11,12],ymm6[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7,8],ymm6[9],ymm2[10,11,12],ymm6[13],ymm2[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 16(%r10), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm5 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 16(%r9), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm14 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm0 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm3[1],ymm11[2],ymm3[3],ymm11[4],ymm3[5],ymm11[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm9 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm2 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 16(%r9), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm0 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm10 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 +; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm14 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7,8],ymm11[9],ymm10[10,11,12],ymm11[13],ymm10[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2],ymm7[3],ymm10[4],ymm7[5],ymm10[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7],ymm8[8,9,10],ymm6[11],ymm8[12,13,14],ymm6[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm8 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7,8],ymm9[9],ymm11[10,11,12],ymm9[13],ymm11[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2],ymm2[3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm5 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm7 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3,4],ymm0[5],ymm7[6,7,8],ymm0[9],ymm7[10,11,12],ymm0[13],ymm7[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3],ymm0[4],ymm6[5],ymm0[6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm7 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7,8],ymm5[9],ymm1[10,11,12],ymm5[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm9 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm10 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm10 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4],ymm6[5],ymm9[6],ymm6[7] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7,8],ymm5[9],ymm7[10,11,12],ymm5[13],ymm7[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm4, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 64(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 160(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 128(%rax) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload @@ -5467,281 +5468,281 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r10), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rax), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r9), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r8), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rcx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r10), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rax), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r9), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rcx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7,8],ymm12[9],ymm1[10,11,12],ymm12[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm15, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4],ymm11[5],ymm1[6],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm15, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm9, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm15[1],ymm11[2,3,4],ymm15[5],ymm11[6,7,8],ymm15[9],ymm11[10,11,12],ymm15[13],ymm11[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm9, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7,8],ymm15[9],ymm10[10,11,12],ymm15[13],ymm10[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm13, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm14, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2],ymm4[3],ymm7[4],ymm4[5],ymm7[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7,8],ymm6[9],ymm1[10,11,12],ymm6[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7,8],ymm6[9],ymm2[10,11,12],ymm6[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm5, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r10), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r9), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm13, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm3[1],ymm11[2],ymm3[3],ymm11[4],ymm3[5],ymm11[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm13, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7,8],ymm9[9],ymm11[10,11,12],ymm9[13],ymm11[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2],ymm2[3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r9), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3,4],ymm0[5],ymm7[6,7,8],ymm0[9],ymm7[10,11,12],ymm0[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3],ymm0[4],ymm6[5],ymm0[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm15, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7,8],ymm11[9],ymm10[10,11,12],ymm11[13],ymm10[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2],ymm7[3],ymm10[4],ymm7[5],ymm10[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7],ymm8[8,9,10],ymm6[11],ymm8[12,13,14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm15, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm8, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7,8],ymm5[9],ymm1[10,11,12],ymm5[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm5, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4],ymm6[5],ymm9[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7,8],ymm5[9],ymm7[10,11,12],ymm5[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload @@ -5774,483 +5775,479 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512F-SLOW-LABEL: store_i8_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $680, %rsp # imm = 0x2A8 +; AVX512F-SLOW-NEXT: subq $648, %rsp # imm = 0x288 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rcx), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 48(%r10), %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 48(%rax), %xmm5 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%r9), %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 48(%r9), %xmm12 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%r8), %xmm8 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vmovdqa 48(%r8), %xmm13 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm8 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm2, %ymm16 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm14, %ymm28 ; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm1 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm2, %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm2 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm9, %ymm14 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm14, %ymm31 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm24 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm22 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm20 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm18 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm5, %ymm30 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm25 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm30 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm29 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm5, %ymm23 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm23 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm21 ; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm8, %ymm7, %ymm21 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm7, %ymm20 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm12[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm13, %ymm19 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm13, %ymm17 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm10, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm29 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm1, %ymm31 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm28 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm17 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 16(%rcx), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rdx), %xmm12 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm1, %ymm27 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm26 -; AVX512F-SLOW-NEXT: vmovdqa 16(%r10), %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rax), %xmm13 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 16(%r10), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rax), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm16 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm1, %ymm24 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 16(%r9), %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 16(%r8), %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 16(%r9), %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa 16(%r8), %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm27 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] ; AVX512F-SLOW-NEXT: vmovdqa 48(%rsi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[3,3,3,3] ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,1,1] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm5 = mem[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm6 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm5 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm7 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vpandnq %zmm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm6 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm6 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vpandnq %zmm4, %zmm8, %zmm4 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm5, %zmm5 ; AVX512F-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vpord %zmm4, %zmm7, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vpord %zmm4, %zmm5, %zmm6 {%k1} ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm10, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm16[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm16 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm5, %zmm16 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm7 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpandnq %zmm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpord %zmm2, %zmm4, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm14 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm14 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = mem[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm9 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm2 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd $96, (%rsp), %ymm2 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm2 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm28[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpandnq %zmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpord %zmm1, %zmm2, %zmm14 {%k1} -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX512F-SLOW-NEXT: vpandnq %zmm0, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpord %zmm0, %zmm2, %zmm9 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm25[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm17 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm5, %zmm17 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vpandnq %zmm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vpord %zmm2, %zmm3, %zmm17 {%k1} -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm2, %ymm25 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm2, %ymm23 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm9, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw (%rsp), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3],xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm11, %ymm10, %ymm20 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm9, %ymm10, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3],xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm11, %ymm9 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm11 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm8[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm12, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rsi), %xmm12 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm11 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm24[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpandnq %zmm0, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpord %zmm0, %zmm1, %zmm11 {%k1} +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm28 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm3, %zmm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm23[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm17[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpandnq %zmm5, %zmm8, %zmm5 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpord %zmm5, %zmm7, %zmm4 {%k1} +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm5, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm5 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm21 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm5, %ymm22 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm2, %ymm26 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm7, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm10, %ymm14 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm5[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm10, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm13, %ymm10, %ymm17 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm5[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm10, %ymm20 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rsi), %xmm10 ; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[2,3,2,3] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[2,3,2,3] ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm30 = xmm7[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm30 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm13, %ymm13 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm30 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm30, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm23 = xmm5[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm13, %ymm13 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm23, %ymm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm5, %zmm13 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm23 = mem[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm3, %zmm5 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm23 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm13, %zmm13 +; AVX512F-SLOW-NEXT: vpandnq %zmm13, %zmm8, %zmm13 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm23 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm24 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm23 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm23, %zmm23 +; AVX512F-SLOW-NEXT: vpord %zmm13, %zmm23, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm23 = xmm12[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm13, %ymm13 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm24, %ymm12 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm23 = xmm10[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm24 = xmm10[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 ; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm30 = mem[2,1,3,3,6,5,7,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm13, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm5, %zmm13 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm7 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm29 = ymm29[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vpandnq %zmm7, %zmm0, %zmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm29 = ymm31[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm28 = ymm28[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm28 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm28, %zmm28 -; AVX512F-SLOW-NEXT: vpord %zmm7, %zmm28, %zmm13 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm30 = xmm15[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm28 = xmm15[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm30 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm28[0],zero,zero,zero,xmm28[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm30, %ymm28 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm29, %ymm15 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm29 = xmm12[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm30 = xmm12[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm15, %zmm15 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm27 = ymm27[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm26 = ymm26[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm27, %zmm26 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm15, %zmm5, %zmm26 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm27 = xmm7[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm22 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm24, %zmm22 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm19 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm18 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm19, %zmm18 -; AVX512F-SLOW-NEXT: vpandnq %zmm22, %zmm0, %zmm19 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm18, %zmm18 -; AVX512F-SLOW-NEXT: vpord %zmm19, %zmm18, %zmm26 {%k1} -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm29[0],zero,zero,zero,xmm29[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm18, %ymm18 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm31, %ymm12 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm12, %zmm12 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm18 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm19 = ymm25[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm18 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm5, %zmm18 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpandnq %zmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpord %zmm1, %zmm2, %zmm18 {%k1} -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm28, %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm3, %zmm13 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm30 = xmm2[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm25 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm27 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm27 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm16 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 +; AVX512F-SLOW-NEXT: vpandnq %zmm25, %zmm8, %zmm25 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm16, %zmm16 +; AVX512F-SLOW-NEXT: vpord %zmm25, %zmm16, %zmm13 {%k1} +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm16, %ymm16 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm29, %ymm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm10, %zmm10 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm28[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm3, %zmm16 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm21[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm18 = ymm22[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm10, %zmm10 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm18 = ymm26[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7 +; AVX512F-SLOW-NEXT: vpandnq %zmm10, %zmm8, %zmm10 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpord %zmm10, %zmm7, %zmm16 {%k1} +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vpandnq %zmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vpord %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpandnq %zmm2, %zmm8, %zmm1 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpord %zmm1, %zmm0, %zmm7 {%k1} ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 384(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-SLOW-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX512F-SLOW-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -6259,294 +6256,288 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: subq $392, %rsp # imm = 0x188 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 48(%rcx), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 48(%rcx), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdx), %xmm13 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm7, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm28 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3],xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 48(%r10), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 48(%rax), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 48(%r9), %xmm5 -; AVX512F-FAST-NEXT: vmovdqa 48(%r8), %xmm6 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm14 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm4[8],xmm13[9],xmm4[9],xmm13[10],xmm4[10],xmm13[11],xmm4[11],xmm13[12],xmm4[12],xmm13[13],xmm4[13],xmm13[14],xmm4[14],xmm13[15],xmm4[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm21 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 48(%r10), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 48(%rax), %xmm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 48(%r9), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa 48(%r8), %xmm5 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm22 ; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm4 ; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm6, %zmm23 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm24 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm23 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm24 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm26 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm27 -; AVX512F-FAST-NEXT: vmovdqa 16(%r10), %xmm6 -; AVX512F-FAST-NEXT: vmovdqa 16(%rax), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa 16(%rcx), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm26 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa 16(%r10), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 16(%rax), %xmm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm20 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm28 ; AVX512F-FAST-NEXT: vmovdqa 16(%r9), %xmm4 ; AVX512F-FAST-NEXT: vmovdqa 16(%r8), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm16 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm29 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa %ymm8, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm30 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm30 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm18 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm31 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm19 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm17 ; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm12, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm12, %ymm18 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> ; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm6 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm2 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm8 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm6 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm8 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm4 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm3 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX512F-FAST-NEXT: vpandnq %zmm29, %zmm0, %zmm29 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm17, %zmm17 +; AVX512F-FAST-NEXT: vpandnq %zmm19, %zmm0, %zmm19 +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm21, %zmm21 ; AVX512F-FAST-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vpord %zmm29, %zmm17, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vpord %zmm19, %zmm21, %zmm4 {%k1} ; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3],xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm15, %zmm17 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm15, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm8 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm19 ; AVX512F-FAST-NEXT: vmovdqa 48(%rsi), %xmm15 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm3 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm29, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm1 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %ymm21, %ymm1 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm6 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm15, %ymm8 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm7 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm3 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm15, %ymm3 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm4 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm28, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm15 +; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm1 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm4 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm15, %ymm4 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm4, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa 16(%rsi), %xmm14 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm15 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm5 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm5, %ymm19, %ymm5 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm14 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm9 -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm7 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm2 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpandnq (%rsp), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm21, %zmm6, %zmm1 -; AVX512F-FAST-NEXT: vpandnq %zmm22, %zmm0, %zmm10 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm23, %zmm11 -; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm24, %zmm6, %zmm3 -; AVX512F-FAST-NEXT: vpandnq %zmm25, %zmm0, %zmm10 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm26, %zmm11 -; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm3 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm27, %zmm6, %zmm4 -; AVX512F-FAST-NEXT: vpandnq %zmm20, %zmm0, %zmm10 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm16, %zmm11 -; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm30, %zmm6, %zmm5 -; AVX512F-FAST-NEXT: vpandnq %zmm31, %zmm0, %zmm10 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm13, %zmm11 -; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm17, %zmm6, %zmm9 -; AVX512F-FAST-NEXT: vpandnq %zmm29, %zmm0, %zmm6 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm28, %zmm0 -; AVX512F-FAST-NEXT: vpord %zmm6, %zmm0, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm14 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm21, %ymm14 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm11 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm14, %ymm8 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm8 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa 16(%rsi), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm2 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm8 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm6 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm15 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandnq (%rsp), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm20, %zmm3, %zmm11 +; AVX512F-FAST-NEXT: vpandnq %zmm22, %zmm0, %zmm7 +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm23, %zmm8 +; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm11 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm24, %zmm3, %zmm14 +; AVX512F-FAST-NEXT: vpandnq %zmm25, %zmm0, %zmm7 +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm26, %zmm8 +; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm14 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm27, %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vpandnq %zmm28, %zmm0, %zmm7 +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm29, %zmm8 +; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm30, %zmm3, %zmm2 +; AVX512F-FAST-NEXT: vpandnq %zmm31, %zmm0, %zmm7 +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm16, %zmm8 +; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm19, %zmm3, %zmm5 +; AVX512F-FAST-NEXT: vpandnq %zmm13, %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm21, %zmm0 +; AVX512F-FAST-NEXT: vpord %zmm3, %zmm0, %zmm5 {%k1} ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) ; AVX512F-FAST-NEXT: addq $392, %rsp # imm = 0x188 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -6557,226 +6548,228 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-SLOW-NEXT: vmovdqa (%r10), %xmm1 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 16(%r10), %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r10), %xmm21 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%r10), %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r10), %xmm22 ; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r10), %xmm19 ; AVX512BW-SLOW-NEXT: vmovdqa (%rax), %xmm0 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 16(%rax), %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rax), %xmm22 +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rax), %xmm16 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rax), %xmm23 ; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rax), %xmm20 ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa 16(%r9), %xmm15 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r9), %xmm23 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r9), %xmm25 -; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%r8), %xmm16 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r8), %xmm26 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%r9), %xmm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r9), %xmm25 +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r9), %xmm21 +; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%r8), %xmm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm26 +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r8), %xmm24 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,16,17,20,21,20,21,21,23,16,17,22,21,22,21,23,23] -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm7, %zmm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,16,17,20,20,20,20,22,21,16,17,20,22,20,22,22,23] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,16,17,20,21,20,21,21,23,16,17,22,21,22,21,23,23] +; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm10, %zmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,16,17,20,20,20,20,22,21,16,17,20,22,20,22,22,23] ; AVX512BW-SLOW-NEXT: movl $-2004318072, %eax # imm = 0x88888888 ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm8, %zmm2 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm6 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm9, %zmm4 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rsi), %xmm28 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rdi), %xmm29 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm5 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rdi), %xmm30 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rcx), %xmm31 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,20,20,18,19,22,21,22,21,20,22,18,19,22,23,22,23] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm8 +; AVX512BW-SLOW-NEXT: vmovdqa 48(%rcx), %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm11 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm12, %zmm14 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,20,20,18,19,22,21,22,21,20,22,18,19,22,23,22,23] ; AVX512BW-SLOW-NEXT: movl $572662306, %eax # imm = 0x22222222 ; AVX512BW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm14, %zmm5 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm17, %zmm17, %zmm17 -; AVX512BW-SLOW-NEXT: vpermw %zmm17, %zmm7, %zmm17 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm8, %zmm17 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm18 = xmm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm27 = xmm0[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpermw %zmm14, %zmm12, %zmm3 {%k2} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm15 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm14 +; AVX512BW-SLOW-NEXT: vpermw %zmm14, %zmm10, %zmm14 +; AVX512BW-SLOW-NEXT: vpermw %zmm15, %zmm9, %zmm14 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm27 = xmm15[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm18, %ymm18 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm27, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa 48(%rdx), %xmm1 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm31[0],xmm1[1],xmm31[1],xmm1[2],xmm31[2],xmm1[3],xmm31[3],xmm1[4],xmm31[4],xmm1[5],xmm31[5],xmm1[6],xmm31[6],xmm1[7],xmm31[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm14, %zmm18 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %xmm30 +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm29 = xmm15[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm29[0],zero,zero,zero,xmm29[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm27, %ymm27 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm29, %ymm15 +; AVX512BW-SLOW-NEXT: vmovdqa 48(%rdx), %xmm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm15, %zmm15 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm27, %zmm27, %zmm27 +; AVX512BW-SLOW-NEXT: vpermw %zmm27, %zmm12, %zmm15 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %xmm29 ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %xmm31 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm24[8],xmm21[8],xmm24[9],xmm21[9],xmm24[10],xmm21[10],xmm24[11],xmm21[11],xmm24[12],xmm21[12],xmm24[13],xmm21[13],xmm24[14],xmm21[14],xmm24[15],xmm21[15] ; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rcx), %xmm27 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm19, %zmm25 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm19, %zmm21 ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm20, %zmm20, %zmm19 -; AVX512BW-SLOW-NEXT: vpermw %zmm19, %zmm7, %zmm19 -; AVX512BW-SLOW-NEXT: vpermw %zmm25, %zmm8, %zmm19 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm25 = xmm20[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm26 = xmm20[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm26 = xmm26[0],zero,zero,zero,xmm26[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm26 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermw %zmm19, %zmm10, %zmm19 +; AVX512BW-SLOW-NEXT: vpermw %zmm21, %zmm9, %zmm19 {%k1} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm21 = xmm20[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm24 = xmm20[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm20 = xmm20[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm26, %ymm20 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm20 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm31[8],xmm1[9],xmm31[9],xmm1[10],xmm31[10],xmm1[11],xmm31[11],xmm1[12],xmm31[12],xmm1[13],xmm31[13],xmm1[14],xmm31[14],xmm1[15],xmm31[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm14, %zmm20 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm24, %ymm20 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm20, %zmm20 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm12, %zmm20 {%k2} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm25, %zmm25, %zmm25 -; AVX512BW-SLOW-NEXT: vpermw %zmm25, %zmm7, %zmm25 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm8, %zmm25 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm30[0],xmm0[1],xmm30[1],xmm0[2],xmm30[2],xmm0[3],xmm30[3],xmm0[4],xmm30[4],xmm0[5],xmm30[5],xmm0[6],xmm30[6],xmm0[7],xmm30[7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm26 = xmm1[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm26 = xmm26[0],zero,zero,zero,xmm26[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm28 = xmm1[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm28[0],zero,zero,zero,xmm28[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm26, %ymm26 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm10, %zmm21 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm9, %zmm21 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm31[0],xmm29[0],xmm31[1],xmm29[1],xmm31[2],xmm29[2],xmm31[3],xmm29[3],xmm31[4],xmm29[4],xmm31[5],xmm29[5],xmm31[6],xmm29[6],xmm31[7],xmm29[7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm28, %ymm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdx), %xmm31 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm1, %zmm26 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm31[0],xmm27[0],xmm31[1],xmm27[1],xmm31[2],xmm27[2],xmm31[3],xmm27[3],xmm31[4],xmm27[4],xmm31[5],xmm27[5],xmm31[6],xmm27[6],xmm31[7],xmm27[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm14, %zmm26 {%k2} +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm24 = xmm0[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm24, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm24 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm27[0],xmm2[1],xmm27[1],xmm2[2],xmm27[2],xmm2[3],xmm27[3],xmm2[4],xmm27[4],xmm2[5],xmm27[5],xmm2[6],xmm27[6],xmm2[7],xmm27[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm12, %zmm24 {%k2} ; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rsi), %xmm28 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdi), %xmm29 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15] -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rcx), %xmm23 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdi), %xmm30 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rcx), %xmm25 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm21, %zmm21, %zmm21 -; AVX512BW-SLOW-NEXT: vpermw %zmm21, %zmm7, %zmm21 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm8, %zmm21 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm30[8],xmm0[9],xmm30[9],xmm0[10],xmm30[10],xmm0[11],xmm30[11],xmm0[12],xmm30[12],xmm0[13],xmm30[13],xmm0[14],xmm30[14],xmm0[15],xmm30[15] +; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm10, %zmm22 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm9, %zmm22 {%k1} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm31[8],xmm29[8],xmm31[9],xmm29[9],xmm31[10],xmm29[10],xmm31[11],xmm29[11],xmm31[12],xmm29[12],xmm31[13],xmm29[13],xmm31[14],xmm29[14],xmm31[15],xmm29[15] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm22 = xmm0[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm23 = xmm0[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm22, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm22 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm31[8],xmm27[8],xmm31[9],xmm27[9],xmm31[10],xmm27[10],xmm31[11],xmm27[11],xmm31[12],xmm27[12],xmm31[13],xmm27[13],xmm31[14],xmm27[14],xmm31[15],xmm27[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm23, %ymm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm27[8],xmm2[9],xmm27[9],xmm2[10],xmm27[10],xmm2[11],xmm27[11],xmm2[12],xmm27[12],xmm2[13],xmm27[13],xmm2[14],xmm27[14],xmm2[15],xmm27[15] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm14, %zmm22 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm12, %zmm23 {%k2} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm16[0],xmm13[0],xmm16[1],xmm13[1],xmm16[2],xmm13[2],xmm16[3],xmm13[3],xmm16[4],xmm13[4],xmm16[5],xmm13[5],xmm16[6],xmm13[6],xmm16[7],xmm13[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm7, %zmm24 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm8, %zmm24 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] +; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm10, %zmm26 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm9, %zmm26 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm27 = xmm0[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm27, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdx), %xmm30 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdx), %xmm2 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm27 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm30[0],xmm23[0],xmm30[1],xmm23[1],xmm30[2],xmm23[2],xmm30[3],xmm23[3],xmm30[4],xmm23[4],xmm30[5],xmm23[5],xmm30[6],xmm23[6],xmm30[7],xmm23[7] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm25[0],xmm2[1],xmm25[1],xmm2[2],xmm25[2],xmm2[3],xmm25[3],xmm2[4],xmm25[4],xmm2[5],xmm25[5],xmm2[6],xmm25[6],xmm2[7],xmm25[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm14, %zmm27 {%k2} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm7, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm8, %zmm1 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX512BW-SLOW-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm3 -; AVX512BW-SLOW-NEXT: vpermw %zmm3, %zmm7, %zmm3 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm4 -; AVX512BW-SLOW-NEXT: vpermw %zmm4, %zmm8, %zmm3 {%k1} -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm12, %zmm27 {%k2} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm16[8],xmm13[8],xmm16[9],xmm13[9],xmm16[10],xmm13[10],xmm16[11],xmm13[11],xmm16[12],xmm13[12],xmm16[13],xmm13[13],xmm16[14],xmm13[14],xmm16[15],xmm13[15] +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm13 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm10, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw %zmm13, %zmm9, %zmm0 {%k1} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX512BW-SLOW-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3],xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm5 +; AVX512BW-SLOW-NEXT: vpermw %zmm5, %zmm10, %zmm5 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm7 +; AVX512BW-SLOW-NEXT: vpermw %zmm7, %zmm9, %zmm5 {%k1} +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm30[8],xmm23[8],xmm30[9],xmm23[9],xmm30[10],xmm23[10],xmm30[11],xmm23[11],xmm30[12],xmm23[12],xmm30[13],xmm23[13],xmm30[14],xmm23[14],xmm30[15],xmm23[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 -; AVX512BW-SLOW-NEXT: vpermw %zmm4, %zmm14, %zmm0 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm7, %ymm7 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm9, %ymm1 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm25[8],xmm2[9],xmm25[9],xmm2[10],xmm25[10],xmm2[11],xmm25[11],xmm2[12],xmm25[12],xmm2[13],xmm25[13],xmm2[14],xmm25[14],xmm2[15],xmm25[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 +; AVX512BW-SLOW-NEXT: vpermw %zmm2, %zmm12, %zmm1 {%k2} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload +; AVX512BW-SLOW-NEXT: # xmm2 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,3,3,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm2 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 -; AVX512BW-SLOW-NEXT: vpermw %zmm6, %zmm14, %zmm4 {%k2} +; AVX512BW-SLOW-NEXT: vpermw %zmm6, %zmm12, %zmm2 {%k2} ; AVX512BW-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm2, %zmm5 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm17, %zmm18 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm14, %zmm15 {%k1} ; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm19, %zmm20 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm25, %zmm26 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm21, %zmm22 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm24, %zmm27 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm21, %zmm24 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm22, %zmm23 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm26, %zmm27 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm5, %zmm2 {%k1} ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm27, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, 320(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, 256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, 448(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -6785,7 +6778,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FAST-NEXT: vmovdqa (%r10), %xmm0 -; AVX512BW-FAST-NEXT: vmovdqa 16(%r10), %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa 16(%r10), %xmm14 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%r10), %xmm18 ; AVX512BW-FAST-NEXT: vmovdqa64 48(%r10), %xmm17 ; AVX512BW-FAST-NEXT: vmovdqa (%rax), %xmm1 @@ -6820,16 +6813,16 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm10 ; AVX512BW-FAST-NEXT: vmovdqa64 48(%rsi), %xmm25 ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdi), %xmm26 +; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdi), %xmm28 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm14 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm27 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm28 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm13 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm26 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm27 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm28, %ymm8, %ymm8 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm8, %ymm8 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm8, %zmm8 ; AVX512BW-FAST-NEXT: movl $572662306, %eax # imm = 0x22222222 ; AVX512BW-FAST-NEXT: kmovd %eax, %k2 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm8 {%k2} @@ -6845,22 +6838,22 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm24, %zmm24 ; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm24, %zmm24 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm16, %zmm24 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm16, %xmm27 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm28, %ymm27 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm28[0],xmm25[0],xmm28[1],xmm25[1],xmm28[2],xmm25[2],xmm28[3],xmm25[3],xmm28[4],xmm25[4],xmm28[5],xmm25[5],xmm28[6],xmm25[6],xmm28[7],xmm25[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm16, %xmm26 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm27, %ymm26 ; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdx), %xmm30 ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm16, %ymm16 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm16, %ymm16 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm30[0],xmm23[0],xmm30[1],xmm23[1],xmm30[2],xmm23[2],xmm30[3],xmm23[3],xmm30[4],xmm23[4],xmm30[5],xmm23[5],xmm30[6],xmm23[6],xmm30[7],xmm23[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm27, %ymm27 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm27, %zmm27 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm27, %zmm27 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm27, %zmm16 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm27 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm16, %ymm16 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm26, %zmm16 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm30[0],xmm23[0],xmm30[1],xmm23[1],xmm30[2],xmm23[2],xmm30[3],xmm23[3],xmm30[4],xmm23[4],xmm30[5],xmm23[5],xmm30[6],xmm23[6],xmm30[7],xmm23[7] +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm26, %ymm26 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm26, %zmm26 +; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm26, %zmm26 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm26, %zmm16 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm26 ; AVX512BW-FAST-NEXT: vmovdqa32 %zmm24, %zmm16 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %xmm29 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %xmm27 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm20[8],xmm17[8],xmm20[9],xmm17[9],xmm20[10],xmm17[10],xmm20[11],xmm17[11],xmm20[12],xmm17[12],xmm20[13],xmm17[13],xmm20[14],xmm17[14],xmm20[15],xmm17[15] ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %xmm24 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] @@ -6872,15 +6865,15 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm20, %zmm20 ; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm20, %zmm20 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm17, %zmm20 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm28 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm29 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm30[8],xmm23[8],xmm30[9],xmm23[9],xmm30[10],xmm23[10],xmm30[11],xmm23[11],xmm30[12],xmm23[12],xmm30[13],xmm23[13],xmm30[14],xmm23[14],xmm30[15],xmm23[15] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm17, %zmm17 ; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm17, %zmm22 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm28[8],xmm25[8],xmm28[9],xmm25[9],xmm28[10],xmm25[10],xmm28[11],xmm25[11],xmm28[12],xmm25[12],xmm28[13],xmm25[13],xmm28[14],xmm25[14],xmm28[15],xmm25[15] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm23 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm23, %ymm23 -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm17, %xmm25 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm23, %ymm23 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm17, %xmm25 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm17, %ymm17 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm17, %zmm17 @@ -6890,96 +6883,96 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm20, %zmm20 ; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm20, %zmm20 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm29[0],xmm27[0],xmm29[1],xmm27[1],xmm29[2],xmm27[2],xmm29[3],xmm27[3],xmm29[4],xmm27[4],xmm29[5],xmm27[5],xmm29[6],xmm27[6],xmm29[7],xmm27[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm22, %ymm22 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm22, %zmm22 ; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm22, %zmm22 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm20, %zmm22 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm28[0],xmm21[0],xmm28[1],xmm21[1],xmm28[2],xmm21[2],xmm28[3],xmm21[3],xmm28[4],xmm21[4],xmm28[5],xmm21[5],xmm28[6],xmm21[6],xmm28[7],xmm21[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm20, %xmm23 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm29[0],xmm21[0],xmm29[1],xmm21[1],xmm29[2],xmm21[2],xmm29[3],xmm21[3],xmm29[4],xmm21[4],xmm29[5],xmm21[5],xmm29[6],xmm21[6],xmm29[7],xmm21[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm20, %xmm23 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm25, %ymm23 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm30 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm28 ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm20, %ymm20 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm20, %ymm20 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm23, %zmm20 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm30[0],xmm24[0],xmm30[1],xmm24[1],xmm30[2],xmm24[2],xmm30[3],xmm24[3],xmm30[4],xmm24[4],xmm30[5],xmm24[5],xmm30[6],xmm24[6],xmm30[7],xmm24[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm28[0],xmm24[0],xmm28[1],xmm24[1],xmm28[2],xmm24[2],xmm28[3],xmm24[3],xmm28[4],xmm24[4],xmm28[5],xmm24[5],xmm28[6],xmm24[6],xmm28[7],xmm24[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm23, %ymm23 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm23, %zmm23 ; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm23, %zmm23 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm23, %zmm20 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa64 16(%r9), %xmm23 ; AVX512BW-FAST-NEXT: vmovdqa32 %zmm22, %zmm20 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 16(%r8), %xmm26 +; AVX512BW-FAST-NEXT: vmovdqa64 16(%r8), %xmm25 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rcx), %xmm22 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm29[8],xmm27[8],xmm29[9],xmm27[9],xmm29[10],xmm27[10],xmm29[11],xmm27[11],xmm29[12],xmm27[12],xmm29[13],xmm27[13],xmm29[14],xmm27[14],xmm29[15],xmm27[15] -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rsi), %xmm19 +; AVX512BW-FAST-NEXT: vmovdqa64 16(%rcx), %xmm19 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm26 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] +; AVX512BW-FAST-NEXT: vmovdqa64 16(%rsi), %xmm22 ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm18 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm18, %zmm18 ; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm18, %zmm18 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm25, %ymm25 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm25, %zmm25 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm25, %zmm27 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm26, %ymm26 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm26, %zmm26 +; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm26, %zmm27 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm18, %zmm27 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdi), %xmm25 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm30[8],xmm24[8],xmm30[9],xmm24[9],xmm30[10],xmm24[10],xmm30[11],xmm24[11],xmm30[12],xmm24[12],xmm30[13],xmm24[13],xmm30[14],xmm24[14],xmm30[15],xmm24[15] +; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdi), %xmm26 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm24[8],xmm28[9],xmm24[9],xmm28[10],xmm24[10],xmm28[11],xmm24[11],xmm28[12],xmm24[12],xmm28[13],xmm24[13],xmm28[14],xmm24[14],xmm28[15],xmm24[15] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm18 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm18, %zmm18 ; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm18, %zmm24 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm21[8],xmm28[9],xmm21[9],xmm28[10],xmm21[10],xmm28[11],xmm21[11],xmm28[12],xmm21[12],xmm28[13],xmm21[13],xmm28[14],xmm21[14],xmm28[15],xmm21[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm29[8],xmm21[8],xmm29[9],xmm21[9],xmm29[10],xmm21[10],xmm29[11],xmm21[11],xmm29[12],xmm21[12],xmm29[13],xmm21[13],xmm29[14],xmm21[14],xmm29[15],xmm21[15] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm21 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm21, %ymm21 -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm18, %xmm28 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm21, %ymm21 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm18, %xmm28 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm28, %ymm18, %ymm18 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm18 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa32 %zmm27, %zmm18 {%k3} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3],xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm21, %ymm21 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm21, %zmm21 ; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm21, %zmm21 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm26[0],xmm23[0],xmm26[1],xmm23[1],xmm26[2],xmm23[2],xmm26[3],xmm23[3],xmm26[4],xmm23[4],xmm26[5],xmm23[5],xmm26[6],xmm23[6],xmm26[7],xmm23[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm25[0],xmm23[0],xmm25[1],xmm23[1],xmm25[2],xmm23[2],xmm25[3],xmm23[3],xmm25[4],xmm23[4],xmm25[5],xmm23[5],xmm25[6],xmm23[6],xmm25[7],xmm23[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm24, %ymm24, %ymm24 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm24, %zmm24 ; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm24, %zmm24 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm21, %zmm24 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm25[0],xmm19[0],xmm25[1],xmm19[1],xmm25[2],xmm19[2],xmm25[3],xmm19[3],xmm25[4],xmm19[4],xmm25[5],xmm19[5],xmm25[6],xmm19[6],xmm25[7],xmm19[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm21, %xmm27 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm26[0],xmm22[0],xmm26[1],xmm22[1],xmm26[2],xmm22[2],xmm26[3],xmm22[3],xmm26[4],xmm22[4],xmm26[5],xmm22[5],xmm26[6],xmm22[6],xmm26[7],xmm22[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm21, %xmm27 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm28, %ymm27 ; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdx), %xmm28 ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm21, %ymm21 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm21, %ymm21 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm21, %ymm21 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm27, %zmm21 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm28[0],xmm22[0],xmm28[1],xmm22[1],xmm28[2],xmm22[2],xmm28[3],xmm22[3],xmm28[4],xmm22[4],xmm28[5],xmm22[5],xmm28[6],xmm22[6],xmm28[7],xmm22[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm28[0],xmm19[0],xmm28[1],xmm19[1],xmm28[2],xmm19[2],xmm28[3],xmm19[3],xmm28[4],xmm19[4],xmm28[5],xmm19[5],xmm28[6],xmm19[6],xmm28[7],xmm19[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm27, %ymm27 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm27, %zmm27 ; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm27, %zmm27 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm27, %zmm21 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa32 %zmm24, %zmm21 {%k3} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm26[8],xmm23[8],xmm26[9],xmm23[9],xmm26[10],xmm23[10],xmm26[11],xmm23[11],xmm26[12],xmm23[12],xmm26[13],xmm23[13],xmm26[14],xmm23[14],xmm26[15],xmm23[15] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 -; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm25[8],xmm23[8],xmm25[9],xmm23[9],xmm25[10],xmm23[10],xmm25[11],xmm23[11],xmm25[12],xmm23[12],xmm25[13],xmm23[13],xmm25[14],xmm23[14],xmm25[15],xmm23[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm14, %zmm14 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm15, %zmm15 ; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm15, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm12, %zmm15 {%k1} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm28[8],xmm22[8],xmm28[9],xmm22[9],xmm28[10],xmm22[10],xmm28[11],xmm22[11],xmm28[12],xmm22[12],xmm28[13],xmm22[13],xmm28[14],xmm22[14],xmm28[15],xmm22[15] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm25[8],xmm19[8],xmm25[9],xmm19[9],xmm25[10],xmm19[10],xmm25[11],xmm19[11],xmm25[12],xmm19[12],xmm25[13],xmm19[13],xmm25[14],xmm19[14],xmm25[15],xmm19[15] -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm19, %xmm22 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm14, %zmm15 {%k1} +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm28[8],xmm19[8],xmm28[9],xmm19[9],xmm28[10],xmm19[10],xmm28[11],xmm19[11],xmm28[12],xmm19[12],xmm28[13],xmm19[13],xmm28[14],xmm19[14],xmm28[15],xmm19[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm22[8],xmm26[9],xmm22[9],xmm26[10],xmm22[10],xmm26[11],xmm22[11],xmm26[12],xmm22[12],xmm26[13],xmm22[13],xmm26[14],xmm22[14],xmm26[15],xmm22[15] +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm19, %xmm22 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm23, %ymm22 ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm19, %ymm19 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm19, %ymm19 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm19, %ymm19 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm22, %zmm19 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm12, %zmm12 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm12, %zmm19 {%k2} +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm14, %zmm19 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa32 %zmm15, %zmm19 {%k3} ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -6996,8 +6989,8 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm4 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm4 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll index a724babe469c5..3c5e3adf038fa 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll @@ -214,17 +214,17 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512DQ-NEXT: vpsrld $8, %zmm0, %zmm1 ; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpandn %ymm0, %ymm1, %ymm2 +; AVX512DQ-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm4 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm3 ; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -234,8 +234,8 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpackuswb %ymm6, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] @@ -301,17 +301,17 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512DQ-NEXT: vpsrld $8, %zmm0, %zmm1 ; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpandn %ymm0, %ymm1, %ymm2 +; AVX512DQ-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm4 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm3 ; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -321,8 +321,8 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpackuswb %ymm6, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll index 4c4a2fc8dcabe..fe2c41f57cfab 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll @@ -136,63 +136,61 @@ define float @test_v3f32(<3 x float> %a0) { define float @test_v4f32(<4 x float> %a0) { ; SSE2-LABEL: test_v4f32: ; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: movaps %xmm0, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: cmpunordss %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: cmpunordss %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm4 ; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: maxss %xmm0, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: maxss %xmm1, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE2-NEXT: andnps %xmm3, %xmm1 -; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: andnps %xmm3, %xmm0 +; SSE2-NEXT: orps %xmm4, %xmm0 ; SSE2-NEXT: movaps %xmm2, %xmm3 -; SSE2-NEXT: maxss %xmm1, %xmm3 -; SSE2-NEXT: cmpunordss %xmm1, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: maxss %xmm0, %xmm3 +; SSE2-NEXT: cmpunordss %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm4 ; SSE2-NEXT: andnps %xmm3, %xmm4 -; SSE2-NEXT: andps %xmm2, %xmm1 -; SSE2-NEXT: orps %xmm4, %xmm1 -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: maxss %xmm1, %xmm2 -; SSE2-NEXT: cmpunordss %xmm1, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: cmpunordss %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm3 ; SSE2-NEXT: andnps %xmm2, %xmm3 -; SSE2-NEXT: andps %xmm0, %xmm1 -; SSE2-NEXT: orps %xmm3, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: +; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm0, %xmm2 ; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: cmpunordss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm4 ; SSE41-NEXT: andps %xmm3, %xmm4 -; SSE41-NEXT: maxss %xmm0, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: maxss %xmm1, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE41-NEXT: andnps %xmm3, %xmm1 -; SSE41-NEXT: orps %xmm4, %xmm1 +; SSE41-NEXT: andnps %xmm3, %xmm0 +; SSE41-NEXT: orps %xmm4, %xmm0 ; SSE41-NEXT: movaps %xmm2, %xmm3 -; SSE41-NEXT: maxss %xmm1, %xmm3 -; SSE41-NEXT: cmpunordss %xmm1, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: maxss %xmm0, %xmm3 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm4 ; SSE41-NEXT: andnps %xmm3, %xmm4 -; SSE41-NEXT: andps %xmm2, %xmm1 -; SSE41-NEXT: orps %xmm4, %xmm1 -; SSE41-NEXT: movaps %xmm0, %xmm2 -; SSE41-NEXT: maxss %xmm1, %xmm2 -; SSE41-NEXT: cmpunordss %xmm1, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm4, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: maxss %xmm0, %xmm2 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 ; SSE41-NEXT: andnps %xmm2, %xmm3 -; SSE41-NEXT: andps %xmm0, %xmm1 -; SSE41-NEXT: orps %xmm3, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: andps %xmm1, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll index 301629f033dbc..ec41657d2f248 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll @@ -729,15 +729,15 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE41-LABEL: test_v16f32: ; SSE41: # %bb.0: ; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: movaps %xmm1, %xmm5 +; SSE41-NEXT: movaps %xmm1, %xmm6 ; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm5 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm6 ; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movaps %xmm3, %xmm6 -; SSE41-NEXT: maxps %xmm5, %xmm6 +; SSE41-NEXT: movaps %xmm3, %xmm5 +; SSE41-NEXT: maxps %xmm6, %xmm5 ; SSE41-NEXT: movaps %xmm3, %xmm0 ; SSE41-NEXT: cmpunordps %xmm3, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm6 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm5 ; SSE41-NEXT: movaps %xmm4, %xmm3 ; SSE41-NEXT: movaps %xmm4, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm3 @@ -749,13 +749,13 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm2 ; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm6, %xmm2 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm6 -; SSE41-NEXT: movaps %xmm6, %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movaps %xmm5, %xmm1 ; SSE41-NEXT: maxps %xmm2, %xmm1 -; SSE41-NEXT: movaps %xmm6, %xmm0 -; SSE41-NEXT: cmpunordps %xmm6, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm6, %xmm1 +; SSE41-NEXT: movaps %xmm5, %xmm0 +; SSE41-NEXT: cmpunordps %xmm5, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; SSE41-NEXT: movd %xmm1, %eax ; SSE41-NEXT: testl %eax, %eax @@ -1279,15 +1279,15 @@ define double @test_v8f64(<8 x double> %a0) { ; SSE41-LABEL: test_v8f64: ; SSE41: # %bb.0: ; SSE41-NEXT: movapd %xmm0, %xmm4 -; SSE41-NEXT: movapd %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm1, %xmm6 ; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm6 -; SSE41-NEXT: maxpd %xmm5, %xmm6 +; SSE41-NEXT: movapd %xmm3, %xmm5 +; SSE41-NEXT: maxpd %xmm6, %xmm5 ; SSE41-NEXT: movapd %xmm3, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 ; SSE41-NEXT: movapd %xmm4, %xmm3 ; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 @@ -1299,13 +1299,13 @@ define double @test_v8f64(<8 x double> %a0) { ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 ; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm1 ; SSE41-NEXT: maxpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm6, %xmm0 -; SSE41-NEXT: cmpunordpd %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: cmpunordpd %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE41-NEXT: movq %xmm1, %rax @@ -1579,15 +1579,15 @@ define double @test_v16f64(<16 x double> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movapd %xmm1, %xmm8 ; SSE41-NEXT: movapd %xmm0, %xmm1 -; SSE41-NEXT: movapd %xmm3, %xmm9 +; SSE41-NEXT: movapd %xmm3, %xmm10 ; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm10 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: movapd %xmm7, %xmm10 -; SSE41-NEXT: maxpd %xmm9, %xmm10 +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: maxpd %xmm10, %xmm9 ; SSE41-NEXT: movapd %xmm7, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm7, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 ; SSE41-NEXT: movapd %xmm8, %xmm7 ; SSE41-NEXT: movapd %xmm8, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 @@ -1599,13 +1599,13 @@ define double @test_v16f64(<16 x double> %a0) { ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 ; SSE41-NEXT: movapd %xmm3, %xmm5 ; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10 -; SSE41-NEXT: movapd %xmm10, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9 +; SSE41-NEXT: movapd %xmm9, %xmm3 ; SSE41-NEXT: maxpd %xmm5, %xmm3 -; SSE41-NEXT: movapd %xmm10, %xmm0 -; SSE41-NEXT: cmpunordpd %xmm10, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 +; SSE41-NEXT: movapd %xmm9, %xmm0 +; SSE41-NEXT: cmpunordpd %xmm9, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3 ; SSE41-NEXT: movapd %xmm2, %xmm5 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll index e622899c8de7a..5ae9e552d0dcd 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll @@ -69,63 +69,61 @@ define float @test_v2f32(<2 x float> %a0) { define float @test_v4f32(<4 x float> %a0) { ; SSE2-LABEL: test_v4f32: ; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: movaps %xmm0, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: cmpunordss %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: cmpunordss %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm4 ; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: minss %xmm0, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: minss %xmm1, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE2-NEXT: andnps %xmm3, %xmm1 -; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: andnps %xmm3, %xmm0 +; SSE2-NEXT: orps %xmm4, %xmm0 ; SSE2-NEXT: movaps %xmm2, %xmm3 -; SSE2-NEXT: minss %xmm1, %xmm3 -; SSE2-NEXT: cmpunordss %xmm1, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: minss %xmm0, %xmm3 +; SSE2-NEXT: cmpunordss %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm4 ; SSE2-NEXT: andnps %xmm3, %xmm4 -; SSE2-NEXT: andps %xmm2, %xmm1 -; SSE2-NEXT: orps %xmm4, %xmm1 -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: minss %xmm1, %xmm2 -; SSE2-NEXT: cmpunordss %xmm1, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: minss %xmm0, %xmm2 +; SSE2-NEXT: cmpunordss %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm3 ; SSE2-NEXT: andnps %xmm2, %xmm3 -; SSE2-NEXT: andps %xmm0, %xmm1 -; SSE2-NEXT: orps %xmm3, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: +; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm0, %xmm2 ; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: cmpunordss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm4 ; SSE41-NEXT: andps %xmm3, %xmm4 -; SSE41-NEXT: minss %xmm0, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: minss %xmm1, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE41-NEXT: andnps %xmm3, %xmm1 -; SSE41-NEXT: orps %xmm4, %xmm1 +; SSE41-NEXT: andnps %xmm3, %xmm0 +; SSE41-NEXT: orps %xmm4, %xmm0 ; SSE41-NEXT: movaps %xmm2, %xmm3 -; SSE41-NEXT: minss %xmm1, %xmm3 -; SSE41-NEXT: cmpunordss %xmm1, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: minss %xmm0, %xmm3 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm4 ; SSE41-NEXT: andnps %xmm3, %xmm4 -; SSE41-NEXT: andps %xmm2, %xmm1 -; SSE41-NEXT: orps %xmm4, %xmm1 -; SSE41-NEXT: movaps %xmm0, %xmm2 -; SSE41-NEXT: minss %xmm1, %xmm2 -; SSE41-NEXT: cmpunordss %xmm1, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm4, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: minss %xmm0, %xmm2 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 ; SSE41-NEXT: andnps %xmm2, %xmm3 -; SSE41-NEXT: andps %xmm0, %xmm1 -; SSE41-NEXT: orps %xmm3, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: andps %xmm1, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll index 0ec81c8077cd4..7f828fc293caa 100644 --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -843,7 +843,7 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k4 ; AVX512BW-NEXT: kmovw (%rdi), %k2 ; AVX512BW-NEXT: kandw %k4, %k2, %k3 -; AVX512BW-NEXT: kmovq %k4, %k6 +; AVX512BW-NEXT: kmovq %k4, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 @@ -855,16 +855,15 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k2, %k3, %k2 ; AVX512BW-NEXT: movw $-9, %ax ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovq %k3, %k4 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovd %eax, %k5 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-33, %ax @@ -917,22 +916,22 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $4, %k0, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k7, %k2 +; AVX512BW-NEXT: kshiftrd $4, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k2, %k3 ; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovd %eax, %k6 +; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k7, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $5, %k0, %k2 @@ -942,141 +941,142 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $27, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k4 ; AVX512BW-NEXT: kshiftrd $26, %k0, %k1 ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovq %k6, %k2 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovq %k7, %k2 +; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k4, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $28, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 +; AVX512BW-NEXT: kshiftrw $13, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $28, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $11, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $29, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $10, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $9, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $29, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $8, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $30, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $7, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $30, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $5, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k4, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $3, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $31, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k7 +; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $31, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k7 -; AVX512BW-NEXT: kshiftrw $2, %k7, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $21, %k0, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k5 +; AVX512BW-NEXT: kandw %k2, %k1, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k5, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $22, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $13, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $22, %k0, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $23, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 +; AVX512BW-NEXT: kshiftrd $23, %k0, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $24, %k0, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $24, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $25, %k0, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $25, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 @@ -1086,63 +1086,63 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $17, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $18, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $19, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $20, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k6 +; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -1152,128 +1152,128 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $11, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrd $10, %k0, %k5 -; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrd $10, %k0, %k4 +; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 +; AVX512BW-NEXT: korw %k2, %k4, %k2 ; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $12, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k5, %k6 +; AVX512BW-NEXT: kshiftrd $12, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $11, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $13, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $10, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $9, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $13, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $8, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $14, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 +; AVX512BW-NEXT: kshiftrd $14, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $5, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $4, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $15, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $3, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $15, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k2} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $6, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $13, %k5, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $14, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $6, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $12, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $7, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $11, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $7, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $10, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $9, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $8, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $8, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 +; AVX512BW-NEXT: kshiftrd $8, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $6, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrd $9, %k0, %k0 ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 @@ -1475,9 +1475,8 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-33, %ax ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovq %k3, %k5 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 ; AVX512BW-NEXT: movw $-65, %ax @@ -1490,8 +1489,9 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: movw $-129, %ax ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovq %k3, %k5 +; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $8, %k2, %k3 ; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF @@ -1549,47 +1549,47 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $59, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $58, %k0, %k2 -; AVX512BW-NEXT: kmovq %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrq $58, %k0, %k1 +; AVX512BW-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512BW-NEXT: kmovq %k7, %k3 ; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k2, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $60, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $61, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -1598,12 +1598,12 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kandw %k6, %k1, %k1 @@ -1611,8 +1611,8 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k2, %k7 ; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -1620,235 +1620,234 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $53, %k0, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k2 +; AVX512BW-NEXT: kandw %k3, %k1, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $54, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $55, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $56, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $57, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 8-byte Reload ; AVX512BW-NEXT: kshiftlw $14, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k2} {z} -; AVX512BW-NEXT: kshiftrq $48, %k0, %k2 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z} +; AVX512BW-NEXT: kshiftrq $48, %k0, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 +; AVX512BW-NEXT: kandw %k3, %k1, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $49, %k0, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrq $49, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $50, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $51, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $52, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $43, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $42, %k0, %k3 -; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrq $42, %k0, %k1 +; AVX512BW-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k3 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k3, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $44, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $45, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $46, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $47, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $37, %k0, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $37, %k0, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrq $38, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 +; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrq $39, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 @@ -1857,113 +1856,114 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrq $40, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrq $41, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k2} {z} -; AVX512BW-NEXT: kshiftrq $32, %k0, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} +; AVX512BW-NEXT: kshiftrq $32, %k0, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 +; AVX512BW-NEXT: kandw %k3, %k1, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $33, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $34, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $35, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $36, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $27, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 @@ -1981,44 +1981,45 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $28, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $29, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $30, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kandw %k7, %k2, %k2 @@ -2026,7 +2027,6 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 @@ -2035,8 +2035,8 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $21, %k0, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 @@ -2047,38 +2047,37 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrq $23, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrq $24, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 @@ -2096,7 +2095,8 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 @@ -2106,8 +2106,7 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k1, %k3, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $16, %k0, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k3 +; AVX512BW-NEXT: kandw %k5, %k1, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 @@ -2115,34 +2114,34 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $17, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $18, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $19, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 @@ -2182,52 +2181,52 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $12, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $13, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $14, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kandw %k7, %k2, %k2 @@ -2244,58 +2243,58 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k2} {z} ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kshiftrw $14, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $6, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $7, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $8, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $9, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -3588,10 +3587,9 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd (%rdi), %k5 ; AVX512BW-NEXT: kshiftrd $1, %k5, %k1 ; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovd %eax, %k6 ; AVX512BW-NEXT: kmovw (%rdi), %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k3 +; AVX512BW-NEXT: kandw %k6, %k2, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 @@ -3639,9 +3637,9 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovd %eax, %k7 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF @@ -3653,9 +3651,9 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $5, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k7 -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovd %eax, %k4 +; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF @@ -3671,9 +3669,9 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k3, %k2 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -3688,8 +3686,8 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 ; AVX512BW-NEXT: kshiftrd $28, %k5, %k1 ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k3 +; AVX512BW-NEXT: kandw %k6, %k1, %k3 +; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload @@ -3722,15 +3720,15 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k4 ; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k4 ; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrd $31, %k5, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 ; AVX512BW-NEXT: kshiftrw $4, %k4, %k7 @@ -3743,77 +3741,77 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $2, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 ; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $25, %k5, %k3 -; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k2 +; AVX512BW-NEXT: kshiftrd $25, %k5, %k2 +; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kandw %k6, %k2, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k4, %k4 -; AVX512BW-NEXT: kandw %k0, %k4, %k4 +; AVX512BW-NEXT: korw %k7, %k3, %k3 +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftrd $26, %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 ; AVX512BW-NEXT: kshiftrw $13, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k4, %k4 +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrd $27, %k5, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k4, %k4 +; AVX512BW-NEXT: korw %k7, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: korw %k7, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k4, %k4 -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: korw %k7, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: korw %k7, %k3, %k3 +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k4, %k4 +; AVX512BW-NEXT: korw %k7, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: korw %k7, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k1 -; AVX512BW-NEXT: korw %k1, %k4, %k1 +; AVX512BW-NEXT: korw %k1, %k3, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -3830,7 +3828,8 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k6, %k6 ; AVX512BW-NEXT: kshiftrd $23, %k5, %k7 ; AVX512BW-NEXT: kmovq %k5, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 @@ -3852,8 +3851,7 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $24, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 @@ -3862,10 +3860,11 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 @@ -3878,77 +3877,76 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kandw %k4, %k5, %k5 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k5, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: korw %k7, %k3, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k5, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm3 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $19, %k0, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k3 +; AVX512BW-NEXT: kandw %k7, %k2, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k6 ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrd $20, %k0, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $9, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $8, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrd $21, %k0, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kandw %k3, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: korw %k1, %k4, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -3971,16 +3969,16 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $17, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 @@ -4024,53 +4022,53 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrd $12, %k0, %k3 ; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kandw %k2, %k3, %k2 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kandw %k6, %k3, %k3 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kandw %k7, %k3, %k3 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $14, %k0, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $9, %k2, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $14, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k4 +; AVX512BW-NEXT: kshiftrw $8, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $7, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $15, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $15, %k0, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k4 ; AVX512BW-NEXT: kshiftrw $4, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload @@ -4079,17 +4077,17 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $9, %k0, %k3 -; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrd $9, %k0, %k2 +; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k1 +; AVX512BW-NEXT: kandw %k1, %k2, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 @@ -4105,7 +4103,8 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k7, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -4126,16 +4125,15 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kandw %k3, %k4, %k4 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload ; AVX512BW-NEXT: kshiftlw $15, %k7, %k5 ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 @@ -4144,63 +4142,63 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $2, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $14, %k7, %k2 -; AVX512BW-NEXT: korw %k2, %k4, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k2} {z} +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k4, %k4 +; AVX512BW-NEXT: kshiftlw $14, %k7, %k3 +; AVX512BW-NEXT: korw %k3, %k4, %k3 +; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k3} {z} ; AVX512BW-NEXT: kshiftrd $6, %k0, %k4 ; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k4, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $7, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $8, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload @@ -4208,69 +4206,69 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $3, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k5, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k1, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm8 {%k1} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k5, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k5, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k5, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $4, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k5 +; AVX512BW-NEXT: kshiftrw $14, %k5, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k5, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k5, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $4, %k0, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k5 +; AVX512BW-NEXT: kshiftrw $10, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k5 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $5, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 @@ -4309,41 +4307,41 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: movw $1, %ax ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} -; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm8 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm8, %zmm2 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm10 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm10 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm12 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm13 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm14 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm15 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm16 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm4 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm17 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm18 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm19 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm5 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm2 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm12 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm3, %zmm4 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm13 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm8, %zmm14 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm15 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm16 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm3, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm7, %zmm17 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm8, %zmm18 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm9, %zmm19 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm11, %zmm6 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm8, %zmm8 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm0 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} @@ -4351,22 +4349,22 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm19 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm5 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 @@ -4375,33 +4373,33 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm13 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm10 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm8 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1216(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1152(%rdx) +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1216(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1152(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1088(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1024(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 960(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 960(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 896(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 832(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 768(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 704(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 640(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 576(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 512(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, (%rdx) @@ -4412,41 +4410,41 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 ; AVX512DQ-NEXT: movw $1, %ax ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 -; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 -; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 +; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm8 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512DQ-NEXT: vpermd %zmm4, %zmm8, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm10 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm12 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm13 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm14 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm15 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm16 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm17 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm18 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm19 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm13 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm8, %zmm14 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm15 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm16 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm3, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm7, %zmm17 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm8, %zmm18 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm9, %zmm19 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm11, %zmm6 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm8, %zmm8 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm0 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} @@ -4454,22 +4452,22 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm19 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1 ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm5 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 @@ -4478,33 +4476,33 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 ; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm13 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 +; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 ; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 ; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm10 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm8 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1216(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1152(%rdx) +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1216(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1152(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1088(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1024(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 960(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 960(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 896(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 832(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 768(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 704(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 640(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 576(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rdx) @@ -4640,8 +4638,8 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -4650,12 +4648,12 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload @@ -4699,11 +4697,12 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $8, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 @@ -4716,12 +4715,11 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -4768,29 +4766,29 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $12, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload @@ -4832,15 +4830,15 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kmovq %k4, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -4849,16 +4847,15 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -4896,32 +4893,32 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $18, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -4930,8 +4927,8 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -4948,20 +4945,19 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -4970,22 +4966,23 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $22, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -4994,11 +4991,12 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k7} {z} -; AVX512BW-NEXT: kandw %k4, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5007,20 +5005,20 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5037,12 +5035,10 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5051,57 +5047,59 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $26, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $27, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5110,20 +5108,19 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: korw %k6, %k0, %k6 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kandw %k2, %k1, %k0 ; AVX512BW-NEXT: kshiftrq $29, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 @@ -5132,37 +5129,36 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $30, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5171,16 +5167,16 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -5196,11 +5192,12 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5209,20 +5206,19 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5231,16 +5227,16 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -5261,53 +5257,53 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $36, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $37, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $38, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -5316,8 +5312,8 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5334,16 +5330,14 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5352,18 +5346,20 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5372,14 +5368,14 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 @@ -5389,24 +5385,23 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $43, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 @@ -5415,16 +5410,15 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5433,36 +5427,38 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: korw %k6, %k0, %k6 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k6} {z} -; AVX512BW-NEXT: kandw %k3, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrq $45, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5475,33 +5471,31 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $47, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -5514,36 +5508,38 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $49, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5552,20 +5548,18 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -5578,12 +5572,11 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5592,10 +5585,11 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -5603,28 +5597,30 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $53, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5645,14 +5641,13 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $55, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -5663,7 +5658,8 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5680,21 +5676,20 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $57, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -5715,49 +5710,49 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $59, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $60, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -5770,19 +5765,18 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 @@ -5792,23 +5786,25 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $63, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -5816,8 +5812,7 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k4 ; AVX512BW-NEXT: korw %k4, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -6389,15 +6384,15 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k7 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-17, %ax ; AVX512BW-NEXT: kmovd %eax, %k0 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: movw $-17, %ax +; AVX512BW-NEXT: kmovd %eax, %k7 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-33, %ax @@ -6455,16 +6450,16 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $3, %k3, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k1, %k1 +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k3, %k2 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k1, %k1 +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k4, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -6475,37 +6470,37 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k0, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k0, %k4 +; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k0, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $30, %k5, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -6517,12 +6512,12 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k7, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -6537,79 +6532,79 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 ; AVX512BW-NEXT: korw %k1, %k4, %k1 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: kmovq %k5, %k1 -; AVX512BW-NEXT: kshiftrd $26, %k5, %k5 -; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k5, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k4, %k7 +; AVX512BW-NEXT: kshiftrd $26, %k5, %k4 +; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k4, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k1 +; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k0, %k6, %k6 -; AVX512BW-NEXT: kshiftrd $27, %k1, %k7 +; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kshiftrd $27, %k5, %k7 +; AVX512BW-NEXT: kmovq %k5, %k2 +; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 ; AVX512BW-NEXT: kshiftrw $13, %k7, %k5 ; AVX512BW-NEXT: korw %k5, %k6, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $10, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $9, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $28, %k1, %k6 -; AVX512BW-NEXT: kmovq %k1, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrd $28, %k2, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k2, %k3 ; AVX512BW-NEXT: korw %k3, %k5, %k3 ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: korw %k0, %k3, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: korw %k2, %k3, %k2 ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k2} {z} -; AVX512BW-NEXT: kmovq %k4, %k0 -; AVX512BW-NEXT: kshiftrd $24, %k4, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k3 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $24, %k0, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 @@ -6617,16 +6612,15 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k3, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -6635,145 +6629,145 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k5, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $3, %k4, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k5, %k3 +; AVX512BW-NEXT: kshiftrw $2, %k4, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k3 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k5, %k2, %k1 +; AVX512BW-NEXT: korw %k4, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: kmovq %k0, %k1 -; AVX512BW-NEXT: kshiftrd $21, %k0, %k3 -; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $21, %k1, %k2 +; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kandw %k0, %k2, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 +; AVX512BW-NEXT: korw %k4, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kandw %k0, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 +; AVX512BW-NEXT: korw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 +; AVX512BW-NEXT: korw %k4, %k3, %k3 +; AVX512BW-NEXT: kandw %k6, %k3, %k3 ; AVX512BW-NEXT: kshiftrd $22, %k1, %k4 ; AVX512BW-NEXT: kmovq %k1, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kandw %k0, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k4 +; AVX512BW-NEXT: korw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k4 ; AVX512BW-NEXT: kshiftrd $23, %k7, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k3 +; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 ; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: korw %k2, %k4, %k2 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k2} {z} +; AVX512BW-NEXT: korw %k3, %k4, %k3 +; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k3} {z} ; AVX512BW-NEXT: kmovq %k7, %k4 ; AVX512BW-NEXT: kshiftrd $18, %k7, %k6 ; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k3 +; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $19, %k7, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $20, %k4, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 @@ -6790,104 +6784,107 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k5, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k5, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: korw %k0, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} +; AVX512BW-NEXT: kmovq %k4, %k0 ; AVX512BW-NEXT: kshiftrd $16, %k4, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $17, %k4, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k5 +; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $17, %k0, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $7, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $3, %k3, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $2, %k3, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k1 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: kmovq %k4, %k0 -; AVX512BW-NEXT: kshiftrd $13, %k4, %k3 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $13, %k0, %k3 ; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k3, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k4, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k5 +; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k5, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k3 +; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $13, %k5, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k3 +; AVX512BW-NEXT: kshiftrw $12, %k5, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrd $14, %k0, %k3 ; AVX512BW-NEXT: kmovq %k0, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 @@ -6897,68 +6894,67 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k0, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $7, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k2, %k3 ; AVX512BW-NEXT: kshiftrd $15, %k7, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 ; AVX512BW-NEXT: korw %k2, %k3, %k2 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $10, %k7, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k1, %k2, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 +; AVX512BW-NEXT: kmovq %k7, %k2 +; AVX512BW-NEXT: kshiftrd $10, %k7, %k0 +; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kandw %k1, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $11, %k7, %k6 -; AVX512BW-NEXT: kmovq %k7, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -6973,56 +6969,55 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kandw %k4, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $12, %k2, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k1, %k4 ; AVX512BW-NEXT: korw %k4, %k5, %k4 ; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: korw %k0, %k4, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: korw %k1, %k4, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kshiftrd $8, %k2, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k1, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 @@ -7052,7 +7047,8 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k4, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -7064,46 +7060,45 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $2, %k2, %k4 ; AVX512BW-NEXT: kmovq %k2, %k5 ; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $5, %k1, %k4 -; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k6, %k4, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k7 +; AVX512BW-NEXT: kshiftrd $5, %k1, %k2 +; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kandw %k6, %k2, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k7 ; AVX512BW-NEXT: kshiftrw $14, %k7, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k7, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k7, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrd $6, %k1, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kandw %k0, %k3, %k3 @@ -7131,8 +7126,8 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovq %k2, %k6 -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k4, %k4 ; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 ; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 @@ -7140,34 +7135,33 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k3, %k4, %k3 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k3} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kshiftrw $14, %k4, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrd $3, %k1, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $13, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 @@ -7231,63 +7225,63 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: movw $1, %ax ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm12 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm13, %zmm14 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm15 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm16 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm17 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm18 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm13, %zmm19 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm4 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm20 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm21 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm22 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm23 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm13, %zmm5 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm9, %zmm9, %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm10, %zmm1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm11, %zmm2 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm12, %zmm3 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm13, %zmm5 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm14, %zmm6 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm4, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm10, %zmm15 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm11, %zmm16 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm17 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm13, %zmm18 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm19 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm4, %zmm8 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm10, %zmm20 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm11, %zmm21 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm12, %zmm22 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm13, %zmm23 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm4, %zmm24 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm14, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm10, %zmm10 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm11 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm0 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm12, %zmm12 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm13 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm14, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm11 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm23 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1 @@ -7296,8 +7290,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm21 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm20 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm19 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 @@ -7308,42 +7302,42 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm15 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm14 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm10 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1472(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1408(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1344(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1280(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1216(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1152(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1472(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1408(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1344(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 1280(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1216(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1152(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 1088(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 960(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 896(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 832(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 768(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 768(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 704(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 640(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 576(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; @@ -7351,63 +7345,63 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 ; AVX512DQ-NEXT: movw $1, %ax ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm7 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm13, %zmm14 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm15 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm16 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm17 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm18 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm13, %zmm19 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm20 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm21 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm22 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm23 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm13, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm9 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm7, %zmm10, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512DQ-NEXT: vpermd %zmm7, %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512DQ-NEXT: vpermd %zmm7, %zmm12, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpermd %zmm7, %zmm13, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512DQ-NEXT: vpermd %zmm7, %zmm14, %zmm6 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm4, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm10, %zmm15 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm11, %zmm16 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm17 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm13, %zmm18 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm19 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm4, %zmm8 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm10, %zmm20 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm11, %zmm21 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm12, %zmm22 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm13, %zmm23 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm4, %zmm24 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm14, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm10, %zmm10 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm0 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vpermd %zmm0, %zmm12, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm13 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm14, %zmm4 +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm11 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1 +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm9 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1 ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm23 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1 @@ -7416,8 +7410,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm21 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1 ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm20 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 +; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm19 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 @@ -7428,42 +7422,42 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 ; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm15 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm3 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 -; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm14 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 -; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm10 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 +; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1472(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1408(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1344(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1280(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1216(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1152(%rdx) +; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 +; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1472(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1408(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1344(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1280(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1216(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1152(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1088(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1024(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 960(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 896(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 832(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 768(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 768(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 704(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 640(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 576(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -7570,6 +7564,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovq %k5, %k3 ; AVX512BW-NEXT: kshiftrq $3, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 @@ -7578,27 +7573,27 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $4, %k5, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: kshiftrq $4, %k3, %k1 +; AVX512BW-NEXT: kmovq %k3, %k7 +; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -7606,8 +7601,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload @@ -7804,8 +7799,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k0 @@ -7865,11 +7860,10 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovq %k4, %k3 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 @@ -7911,15 +7905,16 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $17, %k5, %k1 +; AVX512BW-NEXT: kmovq %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -7933,7 +7928,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $18, %k5, %k1 +; AVX512BW-NEXT: kmovq %k7, %k4 +; AVX512BW-NEXT: kshiftrq $18, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 @@ -7955,7 +7951,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $19, %k5, %k1 +; AVX512BW-NEXT: kshiftrq $19, %k4, %k1 +; AVX512BW-NEXT: kmovq %k4, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -7963,8 +7960,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -7981,7 +7978,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $20, %k5, %k1 +; AVX512BW-NEXT: kshiftrq $20, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -8005,7 +8002,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $21, %k5, %k1 +; AVX512BW-NEXT: kshiftrq $21, %k7, %k1 +; AVX512BW-NEXT: kmovq %k7, %k3 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -8024,10 +8022,10 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $22, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovq %k3, %k5 +; AVX512BW-NEXT: kshiftrq $22, %k3, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -8056,26 +8054,27 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $24, %k5, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 @@ -8233,8 +8232,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -8246,28 +8245,28 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $32, %k5, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $32, %k2, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 @@ -8291,87 +8290,89 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $33, %k5, %k1 +; AVX512BW-NEXT: kshiftrq $33, %k2, %k1 +; AVX512BW-NEXT: kmovq %k2, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $34, %k5, %k1 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovq %k7, %k5 +; AVX512BW-NEXT: kshiftrq $34, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $35, %k5, %k1 +; AVX512BW-NEXT: kmovq %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $36, %k5, %k1 +; AVX512BW-NEXT: kshiftrq $36, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -8382,12 +8383,12 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $37, %k5, %k1 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $37, %k7, %k1 +; AVX512BW-NEXT: kmovq %k7, %k3 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -8395,62 +8396,64 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k7} {z} -; AVX512BW-NEXT: kandw %k4, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $38, %k5, %k1 +; AVX512BW-NEXT: kmovq %k3, %k7 +; AVX512BW-NEXT: kshiftrq $38, %k3, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $39, %k5, %k6 +; AVX512BW-NEXT: kandw %k2, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $39, %k7, %k6 +; AVX512BW-NEXT: kmovq %k7, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -8463,99 +8466,102 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $41, %k5, %k1 +; AVX512BW-NEXT: kmovq %k5, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $42, %k5, %k1 +; AVX512BW-NEXT: kshiftrq $42, %k4, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $43, %k5, %k1 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovq %k4, %k7 +; AVX512BW-NEXT: kshiftrq $43, %k4, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $44, %k5, %k1 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $44, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -8576,7 +8582,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $45, %k5, %k1 +; AVX512BW-NEXT: kshiftrq $45, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -8584,39 +8590,39 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kandw %k5, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $46, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -8633,43 +8639,44 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k1} {z} +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $48, %k5, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $49, %k5, %k1 +; AVX512BW-NEXT: kmovq %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -8677,12 +8684,12 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -8693,7 +8700,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $50, %k5, %k1 +; AVX512BW-NEXT: kmovq %k7, %k5 +; AVX512BW-NEXT: kshiftrq $50, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 @@ -8716,31 +8724,32 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $51, %k5, %k1 +; AVX512BW-NEXT: kmovq %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $52, %k5, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $52, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -8752,19 +8761,20 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $53, %k5, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $53, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -8780,54 +8790,53 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $54, %k5, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $54, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $55, %k5, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $55, %k7, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -8835,6 +8844,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $56, %k5, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k1 @@ -8845,44 +8855,44 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $57, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $58, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 @@ -8912,18 +8922,20 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -8932,8 +8944,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ -8944,12 +8955,11 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -8970,8 +8980,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -8980,48 +8989,49 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $63, %k5, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k6 +; AVX512BW-NEXT: kshiftrq $63, %k5, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $2, %k1, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k2, %k0 +; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 +; AVX512BW-NEXT: korw %k1, %k0, %k1 ; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} ; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 1408(%rdx) @@ -9685,7 +9695,7 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw (%rdi), %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kmovq %k2, %k6 +; AVX512BW-NEXT: kmovq %k2, %k3 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k2 @@ -9694,7 +9704,6 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovq %k2, %k4 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $13, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-9, %ax @@ -9725,8 +9734,8 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovd (%rdi), %k3 -; AVX512BW-NEXT: kshiftrd $1, %k3, %k0 +; AVX512BW-NEXT: kmovd (%rdi), %k6 +; AVX512BW-NEXT: kshiftrd $1, %k6, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 @@ -9771,7 +9780,7 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $2, %k3, %k2 +; AVX512BW-NEXT: kshiftrd $2, %k6, %k2 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kshiftlw $14, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 @@ -9781,357 +9790,362 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: korw %k1, %k0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $29, %k3, %k1 +; AVX512BW-NEXT: kmovq %k6, %k2 +; AVX512BW-NEXT: kshiftrd $29, %k6, %k1 ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k6, %k1, %k0 +; AVX512BW-NEXT: kandw %k3, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k2 -; AVX512BW-NEXT: korw %k2, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k2 -; AVX512BW-NEXT: kshiftrd $30, %k3, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k2 -; AVX512BW-NEXT: kshiftrd $31, %k3, %k4 -; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kshiftlw $15, %k4, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovq %k4, %k6 +; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrd $30, %k2, %k1 +; AVX512BW-NEXT: kmovq %k2, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k3 +; AVX512BW-NEXT: kshiftrd $31, %k4, %k0 +; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k0, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm1 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $27, %k3, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kandw %k7, %k3, %k3 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k3, %k0 +; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 +; AVX512BW-NEXT: korw %k1, %k0, %k1 +; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm1 {%k1} {z} +; AVX512BW-NEXT: kshiftrd $27, %k4, %k1 +; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k7 +; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k3 +; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k3, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k7 +; AVX512BW-NEXT: kshiftrw $13, %k3, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k7 -; AVX512BW-NEXT: kshiftrd $28, %k3, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k3, %k7 +; AVX512BW-NEXT: korw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k7 +; AVX512BW-NEXT: kshiftrd $28, %k4, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k7, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kandw %k5, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k6, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovq %k2, %k4 +; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 -; AVX512BW-NEXT: kmovq %k3, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $2, %k5, %k6 +; AVX512BW-NEXT: kmovq %k5, %k7 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k1} {z} +; AVX512BW-NEXT: korw %k7, %k0, %k2 +; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k2} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload ; AVX512BW-NEXT: kshiftrd $25, %k6, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k0 -; AVX512BW-NEXT: korw %k0, %k5, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k5 -; AVX512BW-NEXT: kshiftrd $26, %k6, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k5 +; AVX512BW-NEXT: kshiftrd $26, %k6, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 +; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $6, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k5, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kshiftrw $2, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k2 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k2} {z} +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k2, %k1 +; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $23, %k2, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $22, %k2, %k4 -; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrd $23, %k2, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k3 +; AVX512BW-NEXT: kshiftrd $22, %k2, %k5 +; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovq %k2, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k4, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k2 +; AVX512BW-NEXT: kshiftrw $14, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $13, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $12, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $11, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 +; AVX512BW-NEXT: kshiftrw $10, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k5 +; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k2 -; AVX512BW-NEXT: kshiftrd $24, %k6, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $8, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $24, %k6, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k5 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $20, %k6, %k0 -; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k0, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k2} {z} +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $20, %k3, %k5 +; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k5 -; AVX512BW-NEXT: kshiftrd $21, %k6, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 +; AVX512BW-NEXT: kandw %k0, %k5, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k6 +; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k6, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $13, %k6, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k1, %k2, %k5 +; AVX512BW-NEXT: kshiftrd $21, %k3, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 +; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $10, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 +; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 +; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 +; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k5, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k5, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k5 +; AVX512BW-NEXT: kandw %k0, %k2, %k5 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $15, %k7, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k7, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftlw $14, %k7, %k4 -; AVX512BW-NEXT: korw %k4, %k5, %k4 -; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: korw %k1, %k4, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k7, %k1 +; AVX512BW-NEXT: korw %k1, %k5, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $18, %k7, %k1 -; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $18, %k2, %k4 +; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k4, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: kmovq %k1, %k2 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 +; AVX512BW-NEXT: kshiftrw $12, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 +; AVX512BW-NEXT: kshiftrw $11, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k3, %k5, %k6 -; AVX512BW-NEXT: kshiftrd $19, %k7, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k6 +; AVX512BW-NEXT: kshiftrd $19, %k2, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $10, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 @@ -10139,99 +10153,98 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $9, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $8, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k0, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k6, %k6 +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k6, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 -; AVX512BW-NEXT: kmovq %k2, %k7 +; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 +; AVX512BW-NEXT: kmovq %k3, %k7 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm6 {%k2} {z} +; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 +; AVX512BW-NEXT: korw %k3, %k5, %k3 +; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 +; AVX512BW-NEXT: korw %k7, %k3, %k3 +; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm6 {%k3} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload ; AVX512BW-NEXT: kshiftrd $16, %k1, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k2 +; AVX512BW-NEXT: korw %k0, %k3, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k3 ; AVX512BW-NEXT: kshiftrd $17, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 +; AVX512BW-NEXT: korw %k0, %k3, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload @@ -10245,88 +10258,90 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload ; AVX512BW-NEXT: kshiftrd $13, %k0, %k2 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k1 +; AVX512BW-NEXT: kandw %k6, %k2, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kandw %k6, %k1, %k2 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k3 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k7, %k1, %k3 ; AVX512BW-NEXT: kshiftrd $14, %k0, %k1 ; AVX512BW-NEXT: kmovq %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k2 -; AVX512BW-NEXT: kshiftrd $15, %k6, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k1 +; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k5 +; AVX512BW-NEXT: kshiftrd $15, %k6, %k3 +; AVX512BW-NEXT: kmovq %k6, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: kandw %k6, %k5, %k5 +; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k5, %k3 +; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 +; AVX512BW-NEXT: korw %k1, %k3, %k1 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $11, %k2, %k6 -; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovq %k0, %k3 +; AVX512BW-NEXT: kshiftrd $11, %k0, %k0 +; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 +; AVX512BW-NEXT: kandw %k1, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kandw %k7, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k7, %k5, %k6 -; AVX512BW-NEXT: kshiftrd $12, %k2, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k6 +; AVX512BW-NEXT: kshiftrd $12, %k3, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 @@ -10334,301 +10349,301 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $10, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $9, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $8, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k6, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $4, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k4, %k6 -; AVX512BW-NEXT: kmovq %k4, %k0 +; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k5, %k4 -; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: korw %k0, %k4, %k4 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm9 {%k4} {z} +; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 +; AVX512BW-NEXT: kmovq %k3, %k0 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k5, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k0, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm9 {%k2} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload ; AVX512BW-NEXT: kshiftrd $9, %k6, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k4, %k4 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k4, %k4 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kandw %k1, %k4, %k5 -; AVX512BW-NEXT: kshiftrd $10, %k6, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k6 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k5 +; AVX512BW-NEXT: kshiftrd $10, %k6, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $6, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k6 +; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k5, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k5, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $2, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k4, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $2, %k4, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $7, %k3, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $6, %k3, %k2 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $7, %k4, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k3 +; AVX512BW-NEXT: kshiftrd $6, %k4, %k2 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k7, %k2, %k4 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k4, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k4 -; AVX512BW-NEXT: kshiftrd $8, %k3, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $14, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $13, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $12, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $11, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $10, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $8, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $8, %k4, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k5 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k4, %k4 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k0, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k2} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $4, %k6, %k1 -; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrd $4, %k6, %k3 +; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 +; AVX512BW-NEXT: kandw %k0, %k3, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k5 -; AVX512BW-NEXT: kshiftrd $5, %k6, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k6 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k5 +; AVX512BW-NEXT: kshiftrd $5, %k6, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $10, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k5, %k4 -; AVX512BW-NEXT: kandw %k3, %k4, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k6 +; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k5, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k5 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k5, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm12 {%k1} {z} +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $14, %k4, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k4, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k4, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $11, %k4, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k2 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $3, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k4 ; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm12 {%k2} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k3, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k4 ; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k3, %k4 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k4 ; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k3, %k4 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k4 ; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k4 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k4 ; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k4 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $3, %k1, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k4, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $2, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 @@ -10661,274 +10676,274 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf64: ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; AVX512F-ONLY-NEXT: movw $1, %ax ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm9, %zmm9, %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm12 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm13, %zmm14 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm15, %zmm16 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm17 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm18 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm19 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm20 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm13, %zmm21 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm15, %zmm22 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm4 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm23 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm24 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm25 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm26 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm13, %zmm27 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm15, %zmm5 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm11 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm13 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm15, %zmm0 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm15 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm11, %zmm11, %zmm11 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm13, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm15, %zmm2 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm3 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm17, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm6 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm19, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm5, %zmm8 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm13, %zmm10 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm15, %zmm12 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm16, %zmm14 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm17, %zmm20 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm18, %zmm21 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm19, %zmm22 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm5, %zmm23 +; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm13, %zmm24 +; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm15, %zmm25 +; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm16, %zmm26 +; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm17, %zmm27 +; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm18, %zmm28 +; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm5, %zmm29 +; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm19, %zmm30 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm31 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm15 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm16, %zmm13 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm17, %zmm11 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm18, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm19, %zmm5 +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm13 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm16 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm17 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm19 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm27 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm27 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm26, %zmm26, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm26 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm26 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm25, %zmm25, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm25 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm25 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm24 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm24 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm23 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm23 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm22 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm21 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm20 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm19 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm18 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm16 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm14 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm14 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm12 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm12 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm6 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1728(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1664(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1600(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1536(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1472(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 1408(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1344(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 1280(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 1216(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 1152(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 1728(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1664(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1600(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1536(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1472(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1408(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1344(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1280(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1216(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1152(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 1088(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1024(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 960(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 896(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 832(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 768(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 704(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, 640(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 896(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 832(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 768(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 256(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, (%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor7_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 ; AVX512DQ-NEXT: movw $1, %ax ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm9 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm13, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm15, %zmm16 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm17 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm18 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm19 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm20 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm13, %zmm21 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm15, %zmm22 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm23 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm24 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm25 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm26 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm13, %zmm27 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm15, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm13 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm15, %zmm0 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm15 {%k1} {z} +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm11 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm8, %zmm13, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: vpermd %zmm8, %zmm15, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512DQ-NEXT: vpermd %zmm8, %zmm17, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512DQ-NEXT: vpermd %zmm8, %zmm19, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm5, %zmm8 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm13, %zmm10 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm15, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm16, %zmm14 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm17, %zmm20 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm18, %zmm21 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm19, %zmm22 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm5, %zmm23 +; AVX512DQ-NEXT: vpermd %zmm11, %zmm13, %zmm24 +; AVX512DQ-NEXT: vpermd %zmm11, %zmm15, %zmm25 +; AVX512DQ-NEXT: vpermd %zmm11, %zmm16, %zmm26 +; AVX512DQ-NEXT: vpermd %zmm11, %zmm17, %zmm27 +; AVX512DQ-NEXT: vpermd %zmm11, %zmm18, %zmm28 +; AVX512DQ-NEXT: vpermd %zmm11, %zmm5, %zmm29 +; AVX512DQ-NEXT: vpermd %zmm11, %zmm19, %zmm30 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm31 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm15 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm16, %zmm13 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm17, %zmm11 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm18, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm19, %zmm5 +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm13 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1 +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm16 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1 +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm17 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1 +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1 +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm19 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm27, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm27 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm27 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm26, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm26 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm26 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm25, %k1 -; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm25 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm25 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1 -; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm24 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm24 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1 -; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm23 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm23 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1 ; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm22 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1 ; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm21 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1 ; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm20 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 -; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm19 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 -; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm18 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1 -; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 -; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm16 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 -; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm14 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm14 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 -; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm12 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm12 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 +; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm6 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1728(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1664(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1600(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1536(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1472(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1408(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1344(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 1280(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 1216(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 1152(%rdx) +; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 +; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 1728(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1664(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1600(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1536(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1472(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1408(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1344(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1280(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1216(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1152(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 1088(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1024(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 960(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 896(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 832(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 768(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 704(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 640(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 896(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 832(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 768(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -10944,8 +10959,9 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-5, %ax ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovq %k2, %k3 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $13, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-9, %ax @@ -10962,9 +10978,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-33, %ax ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovq %k2, %k4 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-65, %ax @@ -10977,8 +10992,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq (%rdi), %k3 -; AVX512BW-NEXT: kshiftrq $1, %k3, %k0 +; AVX512BW-NEXT: kmovq (%rdi), %k4 +; AVX512BW-NEXT: kshiftrq $1, %k4, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 @@ -10995,9 +11010,9 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $6, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovd %eax, %k5 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $5, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF @@ -11013,16 +11028,16 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $2, %k3, %k1 +; AVX512BW-NEXT: kshiftrq $2, %k4, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -11034,22 +11049,22 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k7, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k7, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k7, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k7, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k1 -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kshiftrq $3, %k3, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kmovq %k4, %k7 +; AVX512BW-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: kshiftrq $3, %k4, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11057,24 +11072,23 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -11083,11 +11097,12 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -11097,108 +11112,109 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kandw %k3, %k0, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $5, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $6, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 +; AVX512BW-NEXT: kshiftrq $6, %k7, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 +; AVX512BW-NEXT: kandw %k5, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 +; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k0, %k6, %k6 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $7, %k7, %k0 +; AVX512BW-NEXT: korw %k1, %k6, %k1 +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $7, %k4, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $8, %k7, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $8, %k4, %k0 +; AVX512BW-NEXT: kmovq %k4, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 @@ -11206,18 +11222,17 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $9, %k2, %k1 +; AVX512BW-NEXT: kshiftrq $9, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k7} {z} @@ -11229,11 +11244,11 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -11242,20 +11257,20 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $10, %k2, %k0 +; AVX512BW-NEXT: kshiftrq $10, %k5, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kandw %k4, %k1, %k1 @@ -11265,25 +11280,26 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $11, %k2, %k6 +; AVX512BW-NEXT: kandw %k3, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $11, %k5, %k6 +; AVX512BW-NEXT: kmovq %k5, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -11294,10 +11310,9 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq %k2, %k7 -; AVX512BW-NEXT: kshiftrq $12, %k2, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k1 +; AVX512BW-NEXT: kmovq %k4, %k7 +; AVX512BW-NEXT: kshiftrq $12, %k4, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11313,42 +11328,43 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $13, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k6 +; AVX512BW-NEXT: kshiftrq $13, %k7, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kandw %k5, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kandw %k5, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 +; AVX512BW-NEXT: korw %k1, %k6, %k6 +; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k6} {z} +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload @@ -11375,34 +11391,34 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $15, %k5, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k6 +; AVX512BW-NEXT: kshiftrq $15, %k5, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k6, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 @@ -11451,10 +11467,10 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11471,8 +11487,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ -11483,8 +11499,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -11523,19 +11539,19 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 +; AVX512BW-NEXT: kandw %k5, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kandw %k4, %k1, %k1 @@ -11548,15 +11564,15 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ -11573,22 +11589,22 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k6 ; AVX512BW-NEXT: kshiftrq $22, %k7, %k0 -; AVX512BW-NEXT: kmovq %k7, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 @@ -11596,9 +11612,9 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 ; AVX512BW-NEXT: korw %k1, %k6, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $23, %k2, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k1 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $23, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11610,8 +11626,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -11622,34 +11638,32 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $24, %k2, %k0 +; AVX512BW-NEXT: kshiftrq $24, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -11658,12 +11672,12 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $25, %k2, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kandw %k5, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -11673,7 +11687,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -11691,34 +11706,33 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kandw %k4, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $27, %k7, %k6 -; AVX512BW-NEXT: kmovq %k7, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -11729,18 +11743,18 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kshiftrq $28, %k4, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k1 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $28, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11751,50 +11765,50 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 +; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $29, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k6 +; AVX512BW-NEXT: kshiftrq $29, %k7, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kandw %k5, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k1} {z} +; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 +; AVX512BW-NEXT: korw %k1, %k6, %k6 +; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k6} {z} ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $30, %k7, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $30, %k5, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11802,54 +11816,54 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $31, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 +; AVX512BW-NEXT: kandw %k1, %k0, %k6 +; AVX512BW-NEXT: kshiftrq $31, %k5, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k6, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $32, %k5, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 @@ -11859,19 +11873,18 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 @@ -11882,33 +11895,33 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $34, %k7, %k1 ; AVX512BW-NEXT: kmovq %k7, %k3 +; AVX512BW-NEXT: kshiftrq $34, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -11933,12 +11946,13 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq %k3, %k7 ; AVX512BW-NEXT: kshiftrq $35, %k3, %k0 +; AVX512BW-NEXT: kmovq %k3, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ -11949,29 +11963,29 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $36, %k7, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -12007,31 +12021,32 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k6 +; AVX512BW-NEXT: kandw %k5, %k0, %k6 ; AVX512BW-NEXT: kshiftrq $38, %k7, %k0 +; AVX512BW-NEXT: kmovq %k7, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 @@ -12039,10 +12054,10 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 ; AVX512BW-NEXT: korw %k1, %k6, %k1 ; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $39, %k7, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kmovq %k5, %k7 +; AVX512BW-NEXT: kshiftrq $39, %k5, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -12057,91 +12072,92 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $40, %k7, %k0 -; AVX512BW-NEXT: kmovq %k7, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $41, %k3, %k1 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $41, %k4, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k7} {z} -; AVX512BW-NEXT: kandw %k5, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $42, %k3, %k0 +; AVX512BW-NEXT: kshiftrq $42, %k4, %k0 +; AVX512BW-NEXT: kmovq %k4, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -12149,8 +12165,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k1 @@ -12166,15 +12182,16 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -12200,44 +12217,42 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovq %k4, %k5 -; AVX512BW-NEXT: kandw %k4, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $45, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k0, %k6 +; AVX512BW-NEXT: kshiftrq $45, %k7, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 +; AVX512BW-NEXT: korw %k1, %k6, %k6 +; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k6} {z} +; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $46, %k7, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $46, %k5, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -12260,38 +12275,38 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $47, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k6 +; AVX512BW-NEXT: kshiftrq $47, %k5, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k6, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $48, %k5, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 @@ -12302,42 +12317,42 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $49, %k5, %k0 ; AVX512BW-NEXT: kmovq %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload @@ -12350,7 +12365,6 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $50, %k7, %k1 -; AVX512BW-NEXT: kmovq %k7, %k2 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -12362,12 +12376,11 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload @@ -12376,60 +12389,61 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $51, %k2, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $51, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $52, %k2, %k6 +; AVX512BW-NEXT: kshiftrq $52, %k7, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kandw %k2, %k0, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $53, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 @@ -12442,37 +12456,37 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k6 ; AVX512BW-NEXT: kshiftrq $54, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload @@ -12483,21 +12497,22 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 ; AVX512BW-NEXT: korw %k1, %k6, %k1 ; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} -; AVX512BW-NEXT: kandw %k2, %k0, %k1 +; AVX512BW-NEXT: kandw %k3, %k0, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $55, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kandw %k5, %k1, %k1 @@ -12507,43 +12522,42 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $56, %k7, %k0 -; AVX512BW-NEXT: kmovq %k7, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $57, %k2, %k1 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $57, %k4, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm24 {%k7} {z} @@ -12555,24 +12569,26 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $58, %k2, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k1 +; AVX512BW-NEXT: kmovq %k4, %k7 +; AVX512BW-NEXT: kshiftrq $58, %k4, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ -12583,25 +12599,24 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $59, %k2, %k6 +; AVX512BW-NEXT: kshiftrq $59, %k7, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 @@ -12623,15 +12638,14 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq %k2, %k5 -; AVX512BW-NEXT: kshiftrq $60, %k2, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $60, %k5, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -12643,39 +12657,39 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $61, %k5, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k6 +; AVX512BW-NEXT: kshiftrq $61, %k5, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 +; AVX512BW-NEXT: korw %k1, %k6, %k6 +; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k6} {z} +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $62, %k5, %k0 @@ -12702,7 +12716,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrq $63, %k5, %k5 @@ -12711,14 +12726,13 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k5, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -13119,26 +13133,26 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf32: ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm2, %zmm0 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm4 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm6 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm8 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm10 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm11, %zmm12 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm14 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm14 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm15, %zmm0 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm16, %zmm16, %zmm16 {%k1} {z} -; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm2, %zmm2 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm3, %zmm3 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm5, %zmm5 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm7, %zmm7 @@ -13160,10 +13174,10 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm14 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 @@ -13176,17 +13190,17 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 960(%rdx) +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 960(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 896(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 832(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 768(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 640(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 448(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 384(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx) @@ -13200,26 +13214,26 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-LABEL: mask_replication_factor8_vf32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 ; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm1, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm11, %zmm12 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm14 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm15, %zmm0 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm1 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm16 -; AVX512DQ-NEXT: vpermd %zmm16, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm16, %zmm2, %zmm2 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm3, %zmm3 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm5, %zmm5 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm7, %zmm7 @@ -13241,10 +13255,10 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm14 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 @@ -13257,17 +13271,17 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm6 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 ; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm2 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 960(%rdx) +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 960(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 896(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 832(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 768(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 704(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 640(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 384(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx) @@ -13351,303 +13365,319 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf64: ; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: subq $136, %rsp ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm10, %zmm10, %zmm10 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 -; AVX512F-ONLY-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm14 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm15, %zmm16 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm17, %zmm4 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm18 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm19 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm20 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm21 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm11, %zmm22 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm23 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm24 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm17, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm3, %zmm25 -; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm5, %zmm26 -; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm7, %zmm27 -; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm9, %zmm28 -; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm11, %zmm29 -; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm13, %zmm30 -; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm15, %zmm31 -; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm17, %zmm2 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm3 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm5 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm11 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm13 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm15, %zmm15 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm17, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm12, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm14, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm16, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm18, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm20, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm22, %zmm5 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm24, %zmm7 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm26, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm11 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm13 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm15 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm17 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm20, %zmm19 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm22, %zmm21 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm24, %zmm23 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm26, %zmm25 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm12, %zmm27 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm14, %zmm28 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm16, %zmm29 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm18, %zmm30 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm20, %zmm31 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm22, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm24, %zmm6 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm26, %zmm2 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm0 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm16 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm14 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm20, %zmm12 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm22, %zmm10 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm24, %zmm18 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm26, %zmm8 +; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm26 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm14 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm16 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm15 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm18 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm20 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm22 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm24 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm30 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm29 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm28 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm27 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm26, %zmm26, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm26 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm27 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm25, %zmm25, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm25 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm24 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm25 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm23 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm22 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm23 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm21 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm20 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm21 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm19 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm18 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm15 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm13 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm11 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm5 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm4 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm16 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm14 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm12 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1792(%rsi), %zmm10 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1856(%rsi), %zmm8 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1920(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1792(%rsi), %zmm28 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1856(%rsi), %zmm29 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload +; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z} ; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z} ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm31, 1984(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1920(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1856(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1792(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1728(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1664(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 1600(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1536(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 1472(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 1408(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 1344(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1280(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 1216(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 1152(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 1088(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1024(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 960(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, 896(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 832(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm28, 768(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm29, 704(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm30, 640(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm30, 1920(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm29, 1856(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm28, 1792(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1728(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 1664(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1600(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 1536(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 1472(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 1408(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 1344(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 1280(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1152(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 1088(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 1024(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 960(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 896(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 832(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 768(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, (%rdx) +; AVX512F-ONLY-NEXT: addq $136, %rsp ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor8_vf64: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: subq $136, %rsp ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm10 ; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm15, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm17, %zmm4 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm18 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm19 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm20 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm21 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm11, %zmm22 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm23 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm24 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm17, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm3, %zmm25 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm26 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm7, %zmm27 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm9, %zmm28 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm11, %zmm29 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm13, %zmm30 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm15, %zmm31 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm17, %zmm2 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm13 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm15, %zmm15 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm17, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm12, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm16, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm18, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm20, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm22, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm24, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm26, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm11 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm13 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm15 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm17 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm20, %zmm19 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm22, %zmm21 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm24, %zmm23 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm26, %zmm25 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm12, %zmm27 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm14, %zmm28 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm16, %zmm29 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm18, %zmm30 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm20, %zmm31 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm22, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm24, %zmm6 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm26, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm0 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm16 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm14 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm20, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm22, %zmm10 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm24, %zmm18 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm26, %zmm8 +; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm26 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm14 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm16 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm15 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm5 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm18 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm20 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm22 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm24 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm30 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1 -; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm29 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1 -; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm28 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm27, %k1 -; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm27 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm26, %k1 -; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm26 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm27 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm25, %k1 -; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm25 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm1 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1 -; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm24 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm25 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1 -; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm23 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1 -; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm22 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm23 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1 -; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm21 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1 -; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm20 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm21 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 -; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm19 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 -; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm18 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1 +; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 +; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm15 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 +; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm13 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 +; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm11 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 +; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm9 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 +; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 +; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm5 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm4 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 -; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm16 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 -; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm14 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 -; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm12 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 1792(%rsi), %zmm10 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 1856(%rsi), %zmm8 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 1920(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1 +; AVX512DQ-NEXT: vmovdqa32 1792(%rsi), %zmm28 {%k1} {z} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1 +; AVX512DQ-NEXT: vmovdqa32 1856(%rsi), %zmm29 {%k1} {z} +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1 +; AVX512DQ-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1 ; AVX512DQ-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z} ; AVX512DQ-NEXT: vmovdqa64 %zmm31, 1984(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1920(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1856(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1792(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1728(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1664(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1600(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1536(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 1472(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 1408(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 1344(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1280(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 1216(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 1152(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 1088(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1024(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 960(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 896(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 832(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm28, 768(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, 704(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm30, 640(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm30, 1920(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, 1856(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, 1792(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1728(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1664(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1600(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 1536(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 1472(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 1408(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1344(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 1280(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 1216(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1152(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 1088(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 1024(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 960(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 896(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 832(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 768(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, (%rdx) +; AVX512DQ-NEXT: addq $136, %rsp ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -13661,75 +13691,75 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm12 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm17 -; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm14 +; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm16 +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm15 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm9 -; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm4 +; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm10 +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm5 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k2 +; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 +; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} -; AVX512BW-NEXT: vpmovb2m %zmm2, %k2 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} +; AVX512BW-NEXT: vpmovb2m %zmm5, %k2 +; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm8 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} +; AVX512BW-NEXT: vpmovb2m %zmm10, %k1 +; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm10 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm8 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm13 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm10 {%k2} {z} -; AVX512BW-NEXT: vpmovb2m %zmm9, %k2 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k2} {z} +; AVX512BW-NEXT: vpmovb2m %zmm15, %k2 +; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm17 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm13 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm18 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm15 {%k1} {z} -; AVX512BW-NEXT: vpmovb2m %zmm14, %k1 -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm14 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm19 {%k1} {z} +; AVX512BW-NEXT: vpmovb2m %zmm16, %k1 +; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm16 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm20 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm18 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm21 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm19 {%k2} {z} -; AVX512BW-NEXT: vpmovb2m %zmm17, %k2 -; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm17 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm22 {%k2} {z} +; AVX512BW-NEXT: vpmovb2m %zmm12, %k2 +; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm20 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm21 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm24 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm22 {%k1} {z} -; AVX512BW-NEXT: vpmovb2m %zmm12, %k1 -; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z} +; AVX512BW-NEXT: vpmovb2m %zmm7, %k1 +; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm7 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm24 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k2} {z} -; AVX512BW-NEXT: vpmovb2m %zmm7, %k2 -; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm7 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 1856(%rsi), %zmm28 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1792(%rsi), %zmm29 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 1856(%rsi), %zmm28 {%k1} {z} -; AVX512BW-NEXT: vmovdqa32 1792(%rsi), %zmm29 {%k2} {z} -; AVX512BW-NEXT: kshiftrq $32, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z} @@ -13748,23 +13778,23 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 %zmm21, 1216(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm20, 1152(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 1088(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1024(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 960(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 896(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 832(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 768(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 896(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 832(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 768(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll b/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll index 4e90e4c5fa4da..9294ae48a7695 100644 --- a/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll +++ b/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll @@ -169,70 +169,70 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX1-NEXT: vmovq {{.*#+}} xmm9 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm10 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm11 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm12 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm13 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm14 = mem[0],zero ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm9, %xmm9 -; AVX1-NEXT: vpmovsxbd %xmm9, %xmm13 +; AVX1-NEXT: vpmovsxbd %xmm9, %xmm15 ; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm9, %xmm9 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm10, %xmm10 -; AVX1-NEXT: vpmovsxbd %xmm10, %xmm14 -; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm10, %xmm10 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm11, %xmm11 -; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-NEXT: # xmm15 = mem[0],zero,mem[1],zero -; AVX1-NEXT: vmovdqu (%rdi,%rcx,4), %xmm0 -; AVX1-NEXT: vpslld %xmm15, %xmm0, %xmm1 +; AVX1-NEXT: vpmovsxbd %xmm9, %xmm12 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm10, %xmm9 +; AVX1-NEXT: vpmovsxbd %xmm9, %xmm11 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm9, %xmm10 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm13, %xmm13 +; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[0],zero,mem[1],zero +; AVX1-NEXT: vmovdqu (%rdi,%rcx,4), %xmm9 +; AVX1-NEXT: vpslld %xmm0, %xmm9, %xmm1 ; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-NEXT: # xmm2 = mem[0],zero,mem[1],zero -; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm13, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbd %xmm11, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm11, %xmm11 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm12, %xmm12 -; AVX1-NEXT: vmovdqu 16(%rdi,%rcx,4), %xmm13 -; AVX1-NEXT: vpslld %xmm15, %xmm13, %xmm15 -; AVX1-NEXT: vpslld %xmm2, %xmm13, %xmm2 -; AVX1-NEXT: vpmovsxbd %xmm12, %xmm13 -; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm12, %xmm12 -; AVX1-NEXT: vblendvps %xmm9, %xmm15, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqu 32(%rdi,%rcx,4), %xmm9 +; AVX1-NEXT: vpslld %xmm2, %xmm9, %xmm9 +; AVX1-NEXT: vblendvps %xmm15, %xmm1, %xmm9, %xmm9 +; AVX1-NEXT: vpmovsxbd %xmm13, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm13, %xmm13 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm14, %xmm14 +; AVX1-NEXT: vmovdqu 16(%rdi,%rcx,4), %xmm15 +; AVX1-NEXT: vpslld %xmm0, %xmm15, %xmm0 +; AVX1-NEXT: vpslld %xmm2, %xmm15, %xmm2 +; AVX1-NEXT: vpmovsxbd %xmm14, %xmm15 +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm14, %xmm14 +; AVX1-NEXT: vblendvps %xmm12, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqu 32(%rdi,%rcx,4), %xmm2 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpslld %xmm3, %xmm9, %xmm15 -; AVX1-NEXT: vpslld %xmm4, %xmm9, %xmm9 -; AVX1-NEXT: vblendvps %xmm14, %xmm15, %xmm9, %xmm9 -; AVX1-NEXT: vmovdqu 48(%rdi,%rcx,4), %xmm14 -; AVX1-NEXT: vpslld %xmm3, %xmm14, %xmm15 -; AVX1-NEXT: vpslld %xmm4, %xmm14, %xmm14 -; AVX1-NEXT: vblendvps %xmm10, %xmm15, %xmm14, %xmm10 -; AVX1-NEXT: vmovdqu 64(%rdi,%rcx,4), %xmm14 -; AVX1-NEXT: vpslld %xmm5, %xmm14, %xmm15 -; AVX1-NEXT: vpslld %xmm6, %xmm14, %xmm14 -; AVX1-NEXT: vblendvps %xmm1, %xmm15, %xmm14, %xmm1 -; AVX1-NEXT: vmovdqu 80(%rdi,%rcx,4), %xmm14 -; AVX1-NEXT: vpslld %xmm5, %xmm14, %xmm15 -; AVX1-NEXT: vpslld %xmm6, %xmm14, %xmm14 -; AVX1-NEXT: vblendvps %xmm11, %xmm15, %xmm14, %xmm11 -; AVX1-NEXT: vmovdqu 96(%rdi,%rcx,4), %xmm14 -; AVX1-NEXT: vpslld %xmm7, %xmm14, %xmm15 -; AVX1-NEXT: vpslld %xmm8, %xmm14, %xmm14 -; AVX1-NEXT: vblendvps %xmm13, %xmm15, %xmm14, %xmm13 -; AVX1-NEXT: vmovdqu 112(%rdi,%rcx,4), %xmm14 -; AVX1-NEXT: vpslld %xmm7, %xmm14, %xmm15 -; AVX1-NEXT: vpslld %xmm8, %xmm14, %xmm14 -; AVX1-NEXT: vblendvps %xmm12, %xmm15, %xmm14, %xmm12 -; AVX1-NEXT: vmovups %xmm0, (%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm2, 16(%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm9, 32(%rdi,%rcx,4) +; AVX1-NEXT: vpslld %xmm3, %xmm2, %xmm12 +; AVX1-NEXT: vpslld %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vblendvps %xmm11, %xmm12, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqu 48(%rdi,%rcx,4), %xmm11 +; AVX1-NEXT: vpslld %xmm3, %xmm11, %xmm12 +; AVX1-NEXT: vpslld %xmm4, %xmm11, %xmm11 +; AVX1-NEXT: vblendvps %xmm10, %xmm12, %xmm11, %xmm10 +; AVX1-NEXT: vmovdqu 64(%rdi,%rcx,4), %xmm11 +; AVX1-NEXT: vpslld %xmm5, %xmm11, %xmm12 +; AVX1-NEXT: vpslld %xmm6, %xmm11, %xmm11 +; AVX1-NEXT: vblendvps %xmm1, %xmm12, %xmm11, %xmm1 +; AVX1-NEXT: vmovdqu 80(%rdi,%rcx,4), %xmm11 +; AVX1-NEXT: vpslld %xmm5, %xmm11, %xmm12 +; AVX1-NEXT: vpslld %xmm6, %xmm11, %xmm11 +; AVX1-NEXT: vblendvps %xmm13, %xmm12, %xmm11, %xmm11 +; AVX1-NEXT: vmovdqu 96(%rdi,%rcx,4), %xmm12 +; AVX1-NEXT: vpslld %xmm7, %xmm12, %xmm13 +; AVX1-NEXT: vpslld %xmm8, %xmm12, %xmm12 +; AVX1-NEXT: vblendvps %xmm15, %xmm13, %xmm12, %xmm12 +; AVX1-NEXT: vmovdqu 112(%rdi,%rcx,4), %xmm13 +; AVX1-NEXT: vpslld %xmm7, %xmm13, %xmm15 +; AVX1-NEXT: vpslld %xmm8, %xmm13, %xmm13 +; AVX1-NEXT: vblendvps %xmm14, %xmm15, %xmm13, %xmm13 +; AVX1-NEXT: vmovups %xmm9, (%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm0, 16(%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm2, 32(%rdi,%rcx,4) ; AVX1-NEXT: vmovups %xmm10, 48(%rdi,%rcx,4) ; AVX1-NEXT: vmovups %xmm1, 64(%rdi,%rcx,4) ; AVX1-NEXT: vmovups %xmm11, 80(%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm13, 96(%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm12, 112(%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm12, 96(%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm13, 112(%rdi,%rcx,4) ; AVX1-NEXT: addq $32, %rcx ; AVX1-NEXT: cmpq %rcx, %rdx ; AVX1-NEXT: jne .LBB0_4 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll index 727b3ff2eb45c..3ca0e2121e0d1 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -431,7 +431,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: movd %eax, %xmm6 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax @@ -443,7 +443,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: movd %eax, %xmm5 ; SSE2-NEXT: andl $15, %ecx ; SSE2-NEXT: movzbl -24(%rsp,%rcx), %eax ; SSE2-NEXT: movd %eax, %xmm9 @@ -473,10 +473,10 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] @@ -484,7 +484,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: @@ -515,7 +515,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSSE3-NEXT: movd %eax, %xmm5 +; SSSE3-NEXT: movd %eax, %xmm6 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -527,7 +527,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: movd %eax, %xmm5 ; SSSE3-NEXT: andl $15, %ecx ; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %eax ; SSSE3-NEXT: movd %eax, %xmm9 @@ -557,10 +557,10 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] @@ -568,7 +568,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: @@ -850,7 +850,7 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: andl $15, %r12d ; SSE2-NEXT: movzbl -24(%rsp,%r12), %eax -; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: movd %eax, %xmm6 ; SSE2-NEXT: andl $15, %r15d ; SSE2-NEXT: movzbl -24(%rsp,%r15), %eax ; SSE2-NEXT: movd %eax, %xmm7 @@ -859,7 +859,7 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSE2-NEXT: movd %eax, %xmm8 ; SSE2-NEXT: andl $15, %ebx ; SSE2-NEXT: movzbl -24(%rsp,%rbx), %eax -; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: movd %eax, %xmm5 ; SSE2-NEXT: andl $15, %r11d ; SSE2-NEXT: movzbl -24(%rsp,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm9 @@ -868,10 +868,10 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSE2-NEXT: movd %eax, %xmm10 ; SSE2-NEXT: andl $15, %r9d ; SSE2-NEXT: movzbl -24(%rsp,%r9), %eax -; SSE2-NEXT: movd %eax, %xmm11 +; SSE2-NEXT: movd %eax, %xmm12 ; SSE2-NEXT: andl $15, %r8d ; SSE2-NEXT: movzbl -24(%rsp,%r8), %eax -; SSE2-NEXT: movd %eax, %xmm12 +; SSE2-NEXT: movd %eax, %xmm11 ; SSE2-NEXT: andl $15, %esi ; SSE2-NEXT: movzbl -24(%rsp,%rsi), %eax ; SSE2-NEXT: movd %eax, %xmm13 @@ -888,18 +888,18 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -948,7 +948,7 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSSE3-NEXT: movd %eax, %xmm3 ; SSSE3-NEXT: andl $15, %r12d ; SSSE3-NEXT: movzbl -24(%rsp,%r12), %eax -; SSSE3-NEXT: movd %eax, %xmm5 +; SSSE3-NEXT: movd %eax, %xmm6 ; SSSE3-NEXT: andl $15, %r15d ; SSSE3-NEXT: movzbl -24(%rsp,%r15), %eax ; SSSE3-NEXT: movd %eax, %xmm7 @@ -957,7 +957,7 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSSE3-NEXT: movd %eax, %xmm8 ; SSSE3-NEXT: andl $15, %ebx ; SSSE3-NEXT: movzbl -24(%rsp,%rbx), %eax -; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: movd %eax, %xmm5 ; SSSE3-NEXT: andl $15, %r11d ; SSSE3-NEXT: movzbl -24(%rsp,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm9 @@ -966,10 +966,10 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSSE3-NEXT: movd %eax, %xmm10 ; SSSE3-NEXT: andl $15, %r9d ; SSSE3-NEXT: movzbl -24(%rsp,%r9), %eax -; SSSE3-NEXT: movd %eax, %xmm11 +; SSSE3-NEXT: movd %eax, %xmm12 ; SSSE3-NEXT: andl $15, %r8d ; SSSE3-NEXT: movzbl -24(%rsp,%r8), %eax -; SSSE3-NEXT: movd %eax, %xmm12 +; SSSE3-NEXT: movd %eax, %xmm11 ; SSSE3-NEXT: andl $15, %esi ; SSSE3-NEXT: movzbl -24(%rsp,%rsi), %eax ; SSSE3-NEXT: movd %eax, %xmm13 @@ -986,18 +986,18 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 ; SSSE3-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll index fbf9187df4817..2f3fdeb74dc47 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -4963,15 +4963,15 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; SKX-NEXT: vpextrd $1, %xmm1, %r9d ; SKX-NEXT: movw %r9w, 27(%rdi) ; SKX-NEXT: vmovd %xmm1, %r8d -; SKX-NEXT: vpextrd $3, %xmm0, %esi +; SKX-NEXT: vpextrd $3, %xmm0, %edx ; SKX-NEXT: movw %r8w, 24(%rdi) -; SKX-NEXT: movw %si, 9(%rdi) -; SKX-NEXT: vpextrd $2, %xmm0, %edx -; SKX-NEXT: vpextrd $1, %xmm0, %ecx -; SKX-NEXT: movw %dx, 6(%rdi) -; SKX-NEXT: movw %cx, 3(%rdi) -; SKX-NEXT: vmovd %xmm0, %eax -; SKX-NEXT: movw %ax, (%rdi) +; SKX-NEXT: movw %dx, 9(%rdi) +; SKX-NEXT: vpextrd $2, %xmm0, %esi +; SKX-NEXT: vpextrd $1, %xmm0, %eax +; SKX-NEXT: movw %si, 6(%rdi) +; SKX-NEXT: movw %ax, 3(%rdi) +; SKX-NEXT: vmovd %xmm0, %ecx +; SKX-NEXT: movw %cx, (%rdi) ; SKX-NEXT: shrl $16, %r15d ; SKX-NEXT: movb %r15b, 47(%rdi) ; SKX-NEXT: shrl $16, %r14d @@ -4997,14 +4997,14 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; SKX-NEXT: movw %r9w, 15(%rdi) ; SKX-NEXT: vmovd %xmm0, %r8d ; SKX-NEXT: movw %r8w, 12(%rdi) -; SKX-NEXT: shrl $16, %esi -; SKX-NEXT: movb %sil, 11(%rdi) ; SKX-NEXT: shrl $16, %edx -; SKX-NEXT: movb %dl, 8(%rdi) -; SKX-NEXT: shrl $16, %ecx -; SKX-NEXT: movb %cl, 5(%rdi) +; SKX-NEXT: movb %dl, 11(%rdi) +; SKX-NEXT: shrl $16, %esi +; SKX-NEXT: movb %sil, 8(%rdi) ; SKX-NEXT: shrl $16, %eax -; SKX-NEXT: movb %al, 2(%rdi) +; SKX-NEXT: movb %al, 5(%rdi) +; SKX-NEXT: shrl $16, %ecx +; SKX-NEXT: movb %cl, 2(%rdi) ; SKX-NEXT: shrl $16, %r11d ; SKX-NEXT: movb %r11b, 23(%rdi) ; SKX-NEXT: shrl $16, %r10d diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index 1719e2588db9e..e717ede16a9f0 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -849,8 +849,8 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) { ; ; SSE41-LABEL: trunc_usat_v4i64_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 @@ -862,18 +862,18 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 ; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm7 +; SSE41-NEXT: movapd %xmm2, %xmm7 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: pxor %xmm2, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pand %xmm6, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: packusdw %xmm7, %xmm3 -; SSE41-NEXT: packusdw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: packusdw %xmm7, %xmm2 +; SSE41-NEXT: packusdw %xmm2, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v4i64_v4i16: diff --git a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll index c785db8879d49..a2affbd8728c2 100644 --- a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll +++ b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll @@ -39,10 +39,10 @@ define void @test(<16 x i32> %a0, <16 x i32> %b0, <16 x i32> %a1, <16 x i32> %b1 ; X86-NEXT: kmovw %k0, %eax ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload -; X86-NEXT: kmovw %k0, %ecx +; X86-NEXT: kmovw %k0, %edx ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload -; X86-NEXT: kmovw %k0, %edx +; X86-NEXT: kmovw %k0, %ecx ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload ; X86-NEXT: kmovw %k0, %edi @@ -50,11 +50,11 @@ define void @test(<16 x i32> %a0, <16 x i32> %b0, <16 x i32> %a1, <16 x i32> %b1 ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 # 2-byte Reload ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k3 # 2-byte Reload ; X86-NEXT: kmovw %k2, %edi -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: kmovw %k1, %ecx -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: kmovw %k1, %edx +; X86-NEXT: addl %edi, %edx ; X86-NEXT: addl %edx, %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movw %ax, (%esi) ; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll index 132a6beca8e95..d6716d0edff40 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -1316,24 +1316,24 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: subl $72, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE2-NEXT: movl (%edx), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SSE2-NEXT: movl (%edi), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 4(%edx), %ecx +; X86-SSE2-NEXT: movl 4(%edi), %ecx ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 8(%edx), %edi -; X86-SSE2-NEXT: movl 12(%edx), %ebx -; X86-SSE2-NEXT: movl 16(%edx), %ebp +; X86-SSE2-NEXT: movl 8(%edi), %esi +; X86-SSE2-NEXT: movl 12(%edi), %ebx +; X86-SSE2-NEXT: movl 16(%edi), %ebp ; X86-SSE2-NEXT: movzbl (%eax), %eax -; X86-SSE2-NEXT: movl 20(%edx), %esi -; X86-SSE2-NEXT: movl 24(%edx), %ecx -; X86-SSE2-NEXT: movl 28(%edx), %edx -; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl 20(%edi), %edx +; X86-SSE2-NEXT: movl 24(%edi), %ecx +; X86-SSE2-NEXT: movl 28(%edi), %edi +; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -1348,20 +1348,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: andb $31, %al ; X86-SSE2-NEXT: negb %al -; X86-SSE2-NEXT: movsbl %al, %eax -; X86-SSE2-NEXT: movl 40(%esp,%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 44(%esp,%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 52(%esp,%eax), %esi -; X86-SSE2-NEXT: movl 48(%esp,%eax), %edi -; X86-SSE2-NEXT: movl 60(%esp,%eax), %ebx -; X86-SSE2-NEXT: movl 56(%esp,%eax), %ebp -; X86-SSE2-NEXT: movl 68(%esp,%eax), %edx -; X86-SSE2-NEXT: movl 64(%esp,%eax), %ecx +; X86-SSE2-NEXT: movsbl %al, %edx +; X86-SSE2-NEXT: movl 40(%esp,%edx), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 44(%esp,%edx), %eax +; X86-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-SSE2-NEXT: movl 52(%esp,%edx), %esi +; X86-SSE2-NEXT: movl 48(%esp,%edx), %edi +; X86-SSE2-NEXT: movl 60(%esp,%edx), %ebx +; X86-SSE2-NEXT: movl 56(%esp,%edx), %ebp +; X86-SSE2-NEXT: movl 68(%esp,%edx), %ecx +; X86-SSE2-NEXT: movl 64(%esp,%edx), %edx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %ecx, 24(%eax) -; X86-SSE2-NEXT: movl %edx, 28(%eax) +; X86-SSE2-NEXT: movl %edx, 24(%eax) +; X86-SSE2-NEXT: movl %ecx, 28(%eax) ; X86-SSE2-NEXT: movl %ebp, 16(%eax) ; X86-SSE2-NEXT: movl %ebx, 20(%eax) ; X86-SSE2-NEXT: movl %edi, 8(%eax) diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll index 03cea0e0de6bf..24475360cbbc4 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll @@ -658,15 +658,15 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movb (%eax), %ah ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -676,27 +676,27 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %ah ; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %ah ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %ah, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi @@ -732,29 +732,29 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %bl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -788,26 +788,26 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%edx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %bl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ebx, %ebp, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 4(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -974,15 +974,15 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -993,27 +993,27 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%ebx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -1049,21 +1049,21 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, 28(%esp,%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, 28(%esp,%edx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax @@ -1071,7 +1071,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 12(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1106,27 +1106,27 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%esi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %bl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%esi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ebx, %ebp, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%esi), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%esi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 12(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -1289,46 +1289,46 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi @@ -1365,29 +1365,29 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1422,26 +1422,26 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%edx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %bl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ebx, %ebp, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 4(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -1642,24 +1642,24 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload @@ -1775,24 +1775,24 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -1822,26 +1822,26 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -1849,20 +1849,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -1882,30 +1882,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1914,66 +1914,68 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 20(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 20(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %ebx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edi, %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, (%esp), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edi, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edi, %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 24(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 16(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1987,30 +1989,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $84, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -2022,55 +2024,57 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%edi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%edi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%edi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%edi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, (%esp) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%edi), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $84, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -2201,29 +2205,29 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %sil, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rcx), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rcx), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rcx), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rcx), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %sil, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes: @@ -2277,24 +2281,24 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload @@ -2315,7 +2319,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ecx), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ecx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi @@ -2326,19 +2330,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx @@ -2348,46 +2350,46 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 28(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 20(%eax) @@ -2412,24 +2414,24 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -2447,29 +2449,29 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%ebx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%ebx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %dl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%ebx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%ebx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%ebx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%ebx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%ebx), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%ebx), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx @@ -2521,30 +2523,30 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $88, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -2555,66 +2557,64 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ecx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ecx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 84(%esp,%ecx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 84(%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) @@ -2633,26 +2633,26 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $84, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -2670,56 +2670,58 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%esi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %eax, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%esi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%esi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %ebx, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%esi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%esi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%esi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%esi), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%esi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %edx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%esi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%esi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $84, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -2921,40 +2923,38 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al ; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ch @@ -3057,40 +3057,38 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esi), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl @@ -3108,26 +3106,26 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -3135,20 +3133,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -3167,103 +3165,104 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %eax # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 20(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 20(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 28(%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 24(%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 20(%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ebx, %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 24(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 16(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -3277,91 +3276,93 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $84, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%edi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%edi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%edi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%edi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, (%esp) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%edi), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $84, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -3535,10 +3536,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rdi), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rdi), %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx @@ -3553,10 +3554,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r13 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rdi), %rbp -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14 @@ -3571,9 +3572,9 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 32(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 40(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 40(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r12 @@ -3621,54 +3622,54 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r10, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r12 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r12d +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r15, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 @@ -3715,42 +3716,42 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r10 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r10, %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ebx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ebx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r14,%r14), %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rbx, %r15, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rdi,%rdi), %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rbx, %r12, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r9, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rbx, %r13, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r13, %rbp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r11, %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r12d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %r12d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %r12d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %rdi, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r10, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rbx,%rbx), %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r13,%r13), %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r15, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r12, %rbp ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r11, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r13, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r14, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r13 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 48(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, 56(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 32(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 40(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13 @@ -4184,15 +4185,15 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -4207,13 +4208,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload @@ -4228,7 +4229,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) @@ -4276,20 +4277,20 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -4311,10 +4312,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -4331,146 +4330,146 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 76(%esp,%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%ebx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %edi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 76(%esp,%ebx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 136(%esp,%ebx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 136(%esp,%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 40(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 32(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 52(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 44(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 36(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 32(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -4608,19 +4607,19 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %ebx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%eax), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%eax), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %ebp, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ecx, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx @@ -4634,13 +4633,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, (%esp) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%eax), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 48(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 48(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -4651,12 +4650,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %edi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %edi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 52(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -4723,18 +4722,18 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: negl %esi ; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r14), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r14), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r14), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r14), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx @@ -4786,15 +4785,15 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 48(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, 56(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13 @@ -4942,55 +4941,55 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx), %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rbx, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r10d -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rcx), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r15, %r12 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r15, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r8d +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rcx), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r11, %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %bpl -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rbx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rcx), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r11, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r15, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rcx), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rbx, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r8d +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r8d ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r10, %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rcx), %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rcx), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r10, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rcx, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r10, %r15, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %r11, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r13, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 40(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 16(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 @@ -5039,45 +5038,45 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movslq %esi, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rsi, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rsi, %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rdi, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rdi, %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r10, %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ebx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ebx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r10, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ebp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ebp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %r8, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %rbx, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbx, %r12, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r8, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbx, %r13, %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r9, %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %rbp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %rbp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbx, %rbp, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r14, %rbp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r11, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %r12, %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r11, %rbp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %r13 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rax), %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r14, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 40(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 40(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 24(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 8(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 48(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 32(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 48(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13 @@ -5430,11 +5429,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -5451,12 +5450,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx @@ -5466,67 +5465,67 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%edi), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%edi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%esi), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx @@ -5544,18 +5543,18 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%edi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 52(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5567,7 +5566,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 56(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx) @@ -5595,41 +5594,41 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $216, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5642,8 +5641,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5652,12 +5649,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %ebp, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %edx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -5674,106 +5672,106 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %esi, %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %esi, %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %esi, %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ebx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %esi, %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %esi, %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %esi, %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %esi, %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, 212(%esp,%ebp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, (%esp), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 212(%esp,%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edi @@ -5803,9 +5801,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 36(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 28(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx) @@ -5840,13 +5838,14 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax @@ -5856,10 +5855,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%edi), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %ecx @@ -5870,8 +5868,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -5883,13 +5882,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax @@ -5916,64 +5914,64 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edx), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload @@ -5984,32 +5982,32 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ecx, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, (%esp) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebx, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 44(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 44(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) @@ -6017,7 +6015,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -6209,10 +6207,10 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rdi), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rdi), %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx @@ -6227,10 +6225,10 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r13 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rdi), %rbp -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14 @@ -6245,9 +6243,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 32(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 40(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 40(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r12 @@ -6296,54 +6294,54 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r10, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r12 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r12d +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r15, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 @@ -6391,42 +6389,42 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r10 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r10, %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ebx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ebx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r14,%r14), %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rbx, %r15, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rdi,%rdi), %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rbx, %r12, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r9, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rbx, %r13, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %r13, %rbp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r11, %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r12d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %r12d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %r12d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %rdi, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r10, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rbx,%rbx), %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r13,%r13), %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r15, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %r12, %rbp ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r11, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r13, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r14, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r13 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 48(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, 56(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 32(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 40(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13 @@ -6862,15 +6860,15 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -6885,13 +6883,13 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload @@ -6906,7 +6904,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) @@ -6954,23 +6952,21 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -7008,149 +7004,148 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 76(%esp,%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%ebx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %edi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 76(%esp,%ebx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp), %edx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 136(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ecx, %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 60(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 56(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 40(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 32(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 52(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 44(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 36(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 136(%esp,%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 32(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -7164,7 +7159,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $204, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $200, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -7241,119 +7236,117 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%edx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%edx), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %edi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%edx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%edx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%edx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 40(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 32(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 16(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebp, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 52(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, (%esp) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 48(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 40(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 32(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 16(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 52(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $204, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $200, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index 7bd110748d55b..aaba44c8dc111 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -1072,109 +1072,109 @@ ret void define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, ptr %p) nounwind { ; AVX1-LABEL: interleaved_store_vf64_i8_stride3: ; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rax +; AVX1-NEXT: subq $24, %rsp ; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa %ymm4, %ymm5 +; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovdqa %ymm2, %ymm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm12 -; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm7 -; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm7 -; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm13 -; AVX1-NEXT: vpor %xmm7, %xmm13, %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm10, %xmm8, %xmm13 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm14 -; AVX1-NEXT: vpshufb %xmm11, %xmm14, %xmm15 -; AVX1-NEXT: vpor %xmm13, %xmm15, %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb %xmm15, %xmm9, %xmm9 -; AVX1-NEXT: vpshufb %xmm15, %xmm0, %xmm15 -; AVX1-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm10 -; AVX1-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX1-NEXT: vmovdqa %ymm0, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm6 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm11 +; AVX1-NEXT: vpshufb %xmm14, %xmm11, %xmm7 +; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX1-NEXT: vpshufb %xmm11, %xmm10, %xmm0 +; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm7 +; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm8 +; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm0 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] -; AVX1-NEXT: vpshufb %xmm11, %xmm7, %xmm10 -; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX1-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] -; AVX1-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX1-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm11, %xmm9, %xmm11 -; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] -; AVX1-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm2, %xmm12, %xmm12 -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX1-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm12, %xmm8, %xmm12 -; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] -; AVX1-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4] -; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] -; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload -; AVX1-NEXT: # xmm8 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] -; AVX1-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX1-NEXT: vpor %xmm5, %xmm10, %xmm5 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX1-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX1-NEXT: vpshufb %xmm13, %xmm7, %xmm14 -; AVX1-NEXT: vpor %xmm14, %xmm10, %xmm10 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm9, %xmm7, %xmm9 -; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX1-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX1-NEXT: vpshufb %xmm9, %xmm12, %xmm12 -; AVX1-NEXT: vpshufb %xmm9, %xmm2, %xmm7 -; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm2 -; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm11 +; AVX1-NEXT: vpshufb %xmm13, %xmm9, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm15 +; AVX1-NEXT: vpshufb %xmm14, %xmm15, %xmm10 +; AVX1-NEXT: vpor %xmm8, %xmm10, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm10 +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm5 +; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm12 +; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpshufb %xmm13, %xmm2, %xmm1 +; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm2 +; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm9 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm8 +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX1-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm13, %xmm12, %xmm13 +; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4] +; AVX1-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm5, %xmm11, %xmm11 +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX1-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm11, %xmm10, %xmm11 +; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] +; AVX1-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm0, %xmm15, %xmm15 +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4] +; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] +; AVX1-NEXT: vpshufb %xmm10, %xmm6, %xmm12 +; AVX1-NEXT: vpor %xmm12, %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm12 +; AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX1-NEXT: vpor %xmm9, %xmm12, %xmm9 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm12 +; AVX1-NEXT: vpshufb %xmm10, %xmm14, %xmm14 +; AVX1-NEXT: vpor %xmm14, %xmm12, %xmm12 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm6 -; AVX1-NEXT: vmovdqu %xmm7, 80(%rdi) -; AVX1-NEXT: vmovdqu %xmm0, 64(%rdi) -; AVX1-NEXT: vmovdqu %xmm5, 16(%rdi) +; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm7 +; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm7, %xmm11, %xmm10 +; AVX1-NEXT: vpshufb %xmm7, %xmm15, %xmm6 +; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpshufb %xmm7, %xmm13, %xmm11 +; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqu %xmm6, 80(%rdi) +; AVX1-NEXT: vmovdqu %xmm9, 64(%rdi) +; AVX1-NEXT: vmovdqu %xmm8, 16(%rdi) ; AVX1-NEXT: vmovdqu %xmm4, (%rdi) -; AVX1-NEXT: vmovdqu %xmm12, 48(%rdi) -; AVX1-NEXT: vmovdqu %xmm8, 32(%rdi) -; AVX1-NEXT: vmovdqu %xmm6, 176(%rdi) +; AVX1-NEXT: vmovdqu %xmm10, 48(%rdi) +; AVX1-NEXT: vmovdqu %xmm0, 32(%rdi) +; AVX1-NEXT: vmovdqu %xmm2, 176(%rdi) ; AVX1-NEXT: vmovdqu %xmm1, 160(%rdi) -; AVX1-NEXT: vmovdqu %xmm10, 112(%rdi) +; AVX1-NEXT: vmovdqu %xmm12, 112(%rdi) ; AVX1-NEXT: vmovdqu %xmm3, 96(%rdi) ; AVX1-NEXT: vmovdqu %xmm11, 144(%rdi) -; AVX1-NEXT: vmovdqu %xmm2, 128(%rdi) -; AVX1-NEXT: popq %rax +; AVX1-NEXT: vmovdqu %xmm5, 128(%rdi) +; AVX1-NEXT: addq $24, %rsp ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1273,116 +1273,116 @@ ret void define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX1-LABEL: interleaved_load_vf64_i8_stride3: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqu (%rdi), %xmm9 -; AVX1-NEXT: vmovups 16(%rdi), %xmm0 +; AVX1-NEXT: vmovdqu (%rdi), %xmm11 +; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqu 48(%rdi), %xmm13 +; AVX1-NEXT: vmovups 64(%rdi), %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqu 48(%rdi), %xmm10 -; AVX1-NEXT: vmovdqu 64(%rdi), %xmm3 ; AVX1-NEXT: vmovdqu 80(%rdi), %xmm4 -; AVX1-NEXT: vmovdqu 96(%rdi), %xmm6 +; AVX1-NEXT: vmovdqu 96(%rdi), %xmm5 ; AVX1-NEXT: vmovdqu 112(%rdi), %xmm2 -; AVX1-NEXT: vmovdqu 144(%rdi), %xmm12 -; AVX1-NEXT: vmovdqu 160(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] -; AVX1-NEXT: vpshufb %xmm11, %xmm6, %xmm5 -; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm7 -; AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm8 -; AVX1-NEXT: vpshufb %xmm11, %xmm10, %xmm11 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm15 +; AVX1-NEXT: vmovdqu 144(%rdi), %xmm10 +; AVX1-NEXT: vmovdqu 160(%rdi), %xmm3 +; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] +; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm6 +; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm7 +; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm8 +; AVX1-NEXT: vpshufb %xmm9, %xmm13, %xmm9 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm15, %xmm2, %xmm12 ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpor %xmm6, %xmm15, %xmm0 +; AVX1-NEXT: vpor %xmm5, %xmm12, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm14, %xmm10, %xmm10 +; AVX1-NEXT: vpshufb %xmm15, %xmm3, %xmm12 +; AVX1-NEXT: vpor %xmm10, %xmm12, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm15 +; AVX1-NEXT: vpshufb %xmm14, %xmm11, %xmm11 ; AVX1-NEXT: vmovdqa %xmm1, %xmm0 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpor %xmm12, %xmm15, %xmm1 +; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm12 +; AVX1-NEXT: vpor %xmm11, %xmm12, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX1-NEXT: vpshufb %xmm14, %xmm13, %xmm11 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm15 -; AVX1-NEXT: vpor %xmm9, %xmm15, %xmm6 -; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm13, %xmm10, %xmm9 -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm10 -; AVX1-NEXT: vpor %xmm9, %xmm10, %xmm10 -; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm9 -; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm6 -; AVX1-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX1-NEXT: vmovdqu 32(%rdi), %xmm9 -; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm3 -; AVX1-NEXT: vpshufb %xmm14, %xmm9, %xmm12 -; AVX1-NEXT: vpor %xmm3, %xmm12, %xmm3 -; AVX1-NEXT: vmovdqu 176(%rdi), %xmm12 -; AVX1-NEXT: vpshufb %xmm13, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb %xmm14, %xmm12, %xmm15 -; AVX1-NEXT: vpor %xmm1, %xmm15, %xmm1 -; AVX1-NEXT: vpshufb %xmm13, %xmm2, %xmm13 -; AVX1-NEXT: vmovdqu 128(%rdi), %xmm15 -; AVX1-NEXT: vpshufb %xmm14, %xmm15, %xmm14 -; AVX1-NEXT: vpor %xmm13, %xmm14, %xmm14 +; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm13 +; AVX1-NEXT: vpor %xmm11, %xmm13, %xmm11 +; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm13 +; AVX1-NEXT: vpshufb %xmm15, %xmm4, %xmm5 +; AVX1-NEXT: vpor %xmm5, %xmm13, %xmm5 +; AVX1-NEXT: vmovdqu 32(%rdi), %xmm1 +; AVX1-NEXT: vpshufb %xmm14, %xmm0, %xmm13 +; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm10 +; AVX1-NEXT: vpor %xmm13, %xmm10, %xmm10 +; AVX1-NEXT: vmovdqu 176(%rdi), %xmm13 +; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm0 +; AVX1-NEXT: vpshufb %xmm15, %xmm13, %xmm12 +; AVX1-NEXT: vpor %xmm0, %xmm12, %xmm3 +; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm12 +; AVX1-NEXT: vmovdqu 128(%rdi), %xmm14 +; AVX1-NEXT: vpshufb %xmm15, %xmm14, %xmm15 +; AVX1-NEXT: vpor %xmm12, %xmm15, %xmm15 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm0, %xmm15, %xmm13 -; AVX1-NEXT: vpor %xmm5, %xmm13, %xmm13 -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm14 -; AVX1-NEXT: vpor %xmm7, %xmm14, %xmm14 -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm7 -; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm8[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm12 +; AVX1-NEXT: vpor %xmm6, %xmm12, %xmm12 +; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm15 +; AVX1-NEXT: vpor %xmm7, %xmm15, %xmm15 +; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm11[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm8, %xmm10, %xmm10 +; AVX1-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm10 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14] ; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm4 ; AVX1-NEXT: vpor %xmm4, %xmm10, %xmm4 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm10 -; AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX1-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm10 +; AVX1-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm10, %xmm1 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm10 -; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX1-NEXT: vpor %xmm12, %xmm10, %xmm10 +; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm10 +; AVX1-NEXT: vpshufb %xmm11, %xmm13, %xmm13 +; AVX1-NEXT: vpor %xmm13, %xmm10, %xmm10 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm8 -; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm11 -; AVX1-NEXT: vpor %xmm11, %xmm8, %xmm8 +; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm9 +; AVX1-NEXT: vpshufb %xmm11, %xmm14, %xmm11 +; AVX1-NEXT: vpor %xmm11, %xmm9, %xmm9 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] ; AVX1-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpshufb %xmm11, %xmm7, %xmm3 +; AVX1-NEXT: vpaddb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm2 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm10, %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm3 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm13, %xmm4, %xmm4 ; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm3, %xmm9, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm11, %xmm14, %xmm3 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm10, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm11, %xmm13, %xmm3 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm8, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: interleaved_load_vf64_i8_stride3: @@ -1489,64 +1489,64 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x ; AVX1-LABEL: interleaved_store_vf64_i8_stride4: ; AVX1: # %bb.0: ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm10 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm11 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm11 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm12 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm13 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm14 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm11 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm12 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm13 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm13 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX1-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX1-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm14, %ymm8 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6 +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm15, %ymm6 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm3 ; AVX1-NEXT: vmovaps %ymm3, 224(%rdi) ; AVX1-NEXT: vmovaps %ymm0, 192(%rdi) ; AVX1-NEXT: vmovaps %ymm1, 160(%rdi) -; AVX1-NEXT: vmovaps %ymm6, 128(%rdi) +; AVX1-NEXT: vmovaps %ymm4, 128(%rdi) ; AVX1-NEXT: vmovaps %ymm2, 96(%rdi) ; AVX1-NEXT: vmovaps %ymm9, 64(%rdi) -; AVX1-NEXT: vmovaps %ymm4, 32(%rdi) +; AVX1-NEXT: vmovaps %ymm6, 32(%rdi) ; AVX1-NEXT: vmovaps %ymm8, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll index 2517530ca28ec..6eb34b4e773e8 100644 --- a/llvm/test/CodeGen/X86/xmulo.ll +++ b/llvm/test/CodeGen/X86/xmulo.ll @@ -222,10 +222,10 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ; WIN32-NEXT: movl %edi, %esi ; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %eax, %ebp -; WIN32-NEXT: addl %esi, %ebp +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: addl %eax, %ecx +; WIN32-NEXT: addl %esi, %ecx ; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: sarl $31, %eax ; WIN32-NEXT: movl %eax, %edi @@ -235,9 +235,9 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ; WIN32-NEXT: movl %edx, %esi ; WIN32-NEXT: addl %edi, %esi ; WIN32-NEXT: addl %eax, %esi -; WIN32-NEXT: addl %ecx, %eax +; WIN32-NEXT: addl %ebp, %eax ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %ebp, %esi +; WIN32-NEXT: adcl %ecx, %esi ; WIN32-NEXT: movl %ebx, %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: mull %ecx @@ -570,14 +570,14 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) { ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: pushl %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %ebx, %edi +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: movl %eax, %ebx ; WIN32-NEXT: imull %ecx, %edi +; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %esi ; WIN32-NEXT: movl %eax, %ecx @@ -586,36 +586,35 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) { ; WIN32-NEXT: movl %ebx, %eax ; WIN32-NEXT: sarl $31, %eax ; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: imull %ebp, %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: mull %ebp +; WIN32-NEXT: imull {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: addl %edi, %ebx ; WIN32-NEXT: addl %eax, %ebx ; WIN32-NEXT: addl %ecx, %eax ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill ; WIN32-NEXT: adcl %esi, %ebx -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %ebp ; WIN32-NEXT: movl %edx, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: addl %esi, %edi -; WIN32-NEXT: adcl $0, %ecx -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: mull %ebp ; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: addl %esi, %ecx +; WIN32-NEXT: adcl $0, %ebp +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl %edx, %edi ; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: addl %edi, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: adcl %ecx, %ebp +; WIN32-NEXT: addl %ecx, %esi +; WIN32-NEXT: adcl %ebp, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: setb %cl -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %ebp, %eax +; WIN32-NEXT: addl %edi, %eax ; WIN32-NEXT: movzbl %cl, %ecx ; WIN32-NEXT: adcl %ecx, %edx ; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload @@ -628,9 +627,9 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) { ; WIN32-NEXT: jne LBB12_2 ; WIN32-NEXT: # %bb.1: ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: LBB12_2: -; WIN32-NEXT: movl %edi, %edx +; WIN32-NEXT: movl %ebp, %edx ; WIN32-NEXT: addl $4, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi @@ -1001,42 +1000,42 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) { ; WIN32-NEXT: movl %edi, %esi ; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %eax, %ebx -; WIN32-NEXT: addl %esi, %ebx +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %eax, %ebx +; WIN32-NEXT: addl %eax, %ecx +; WIN32-NEXT: addl %esi, %ecx ; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: sarl $31, %eax ; WIN32-NEXT: movl %eax, %edi ; WIN32-NEXT: imull %ebp, %edi -; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: mull %ebp ; WIN32-NEXT: movl %edx, %esi ; WIN32-NEXT: addl %edi, %esi ; WIN32-NEXT: addl %eax, %esi -; WIN32-NEXT: addl %ecx, %eax +; WIN32-NEXT: addl %ebx, %eax ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %ebx, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: adcl %ecx, %esi +; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl %edx, %edi ; WIN32-NEXT: movl %eax, %ecx ; WIN32-NEXT: addl %ebx, %ecx -; WIN32-NEXT: adcl $0, %ebp -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: adcl $0, %edi +; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: movl %edx, %ebp ; WIN32-NEXT: movl %eax, %ebx ; WIN32-NEXT: addl %ecx, %ebx -; WIN32-NEXT: adcl %ebp, %edi +; WIN32-NEXT: adcl %edi, %ebp ; WIN32-NEXT: setb %cl ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %edi, %eax +; WIN32-NEXT: addl %ebp, %eax ; WIN32-NEXT: movzbl %cl, %ecx ; WIN32-NEXT: adcl %ecx, %edx ; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload @@ -1696,23 +1695,23 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: subl $16, %esp +; WIN32-NEXT: subl $20, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl (%eax), %edx -; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl 4(%eax), %esi -; WIN32-NEXT: movl %esi, (%esp) # 4-byte Spill +; WIN32-NEXT: movl (%eax), %ebx +; WIN32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl 4(%eax), %ebp ; WIN32-NEXT: movl %ecx, %eax ; WIN32-NEXT: movl %ecx, %edi ; WIN32-NEXT: sarl $31, %eax ; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: imull %esi, %ecx -; WIN32-NEXT: mull %edx -; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: imull %ebp, %ecx +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill ; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: addl %ecx, %ebx -; WIN32-NEXT: movl %esi, %ecx +; WIN32-NEXT: movl %ebp, %ecx +; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: movl %edi, %esi ; WIN32-NEXT: imull %ecx, %esi @@ -1721,35 +1720,36 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN32-NEXT: movl %edx, %edi ; WIN32-NEXT: addl %eax, %edi ; WIN32-NEXT: addl %esi, %edi -; WIN32-NEXT: addl %ebp, %ebx -; WIN32-NEXT: addl %eax, %ebp -; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl (%esp), %ecx # 4-byte Reload +; WIN32-NEXT: addl %ecx, %ebx +; WIN32-NEXT: addl %eax, %ecx +; WIN32-NEXT: movl %ecx, (%esp) # 4-byte Spill ; WIN32-NEXT: adcl %ebx, %edi -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl (%esp), %eax # 4-byte Reload -; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull %esi ; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %ebp, %ecx +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; WIN32-NEXT: adcl $0, %ebx -; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: movl %ecx, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %ecx, %ebp -; WIN32-NEXT: adcl %ebx, %esi -; WIN32-NEXT: setb %cl -; WIN32-NEXT: movl (%esp), %eax # 4-byte Reload +; WIN32-NEXT: addl %esi, %ebp +; WIN32-NEXT: adcl %ebx, %ecx +; WIN32-NEXT: setb %bl +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %esi, %eax -; WIN32-NEXT: movzbl %cl, %ecx +; WIN32-NEXT: addl %ecx, %eax +; WIN32-NEXT: movzbl %bl, %ecx ; WIN32-NEXT: adcl %ecx, %edx -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; WIN32-NEXT: adcl %edi, %edx ; WIN32-NEXT: movl %ebp, %ecx ; WIN32-NEXT: sarl $31, %ecx @@ -1761,7 +1761,7 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al -; WIN32-NEXT: addl $16, %esp +; WIN32-NEXT: addl $20, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -1805,44 +1805,44 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) { ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: subl $16, %esp +; WIN32-NEXT: subl $12, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl (%eax), %ebp -; WIN32-NEXT: movl 4(%eax), %ebx -; WIN32-NEXT: movl %ebx, (%esp) # 4-byte Spill +; WIN32-NEXT: movl 4(%eax), %eax ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %ebx, %esi +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill ; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: movl %eax, %ebx ; WIN32-NEXT: addl %eax, %ecx ; WIN32-NEXT: addl %esi, %ecx -; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: imull {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: movl %edx, %esi -; WIN32-NEXT: addl %ebx, %esi +; WIN32-NEXT: addl %edi, %esi ; WIN32-NEXT: addl %eax, %esi -; WIN32-NEXT: addl %edi, %eax +; WIN32-NEXT: addl %ebx, %eax ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: adcl %ecx, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull %ebp -; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull %ebp ; WIN32-NEXT: movl %edx, %edi ; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; WIN32-NEXT: addl %ebx, %ecx ; WIN32-NEXT: adcl $0, %edi -; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull (%esp) # 4-byte Folded Reload ; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: movl %eax, %ebp @@ -1866,7 +1866,7 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) { ; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al -; WIN32-NEXT: addl $16, %esp +; WIN32-NEXT: addl $12, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -2221,30 +2221,29 @@ define zeroext i1 @umuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl (%eax), %esi +; WIN32-NEXT: movl (%eax), %ebp ; WIN32-NEXT: movl 4(%eax), %eax -; WIN32-NEXT: testl %ebx, %ebx +; WIN32-NEXT: testl %esi, %esi ; WIN32-NEXT: setne %dl ; WIN32-NEXT: testl %eax, %eax ; WIN32-NEXT: setne %cl ; WIN32-NEXT: andb %dl, %cl -; WIN32-NEXT: mull %ebp +; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: seto %ch -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: mull %esi ; WIN32-NEXT: seto %bl -; WIN32-NEXT: orb %ch, %bl -; WIN32-NEXT: orb %cl, %bl -; WIN32-NEXT: leal (%edi,%eax), %ecx ; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: mull %ebp -; WIN32-NEXT: addl %ecx, %edx +; WIN32-NEXT: seto %ch +; WIN32-NEXT: orb %bl, %ch +; WIN32-NEXT: orb %cl, %ch +; WIN32-NEXT: leal (%edi,%eax), %esi +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: addl %esi, %edx ; WIN32-NEXT: setb %cl -; WIN32-NEXT: orb %bl, %cl +; WIN32-NEXT: orb %ch, %cl ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movl %eax, (%esi) ; WIN32-NEXT: movl %edx, 4(%esi) @@ -2300,9 +2299,9 @@ define zeroext i1 @umuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) { ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl (%edx), %ebp -; WIN32-NEXT: movl 4(%edx), %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl (%ecx), %ebp +; WIN32-NEXT: movl 4(%ecx), %esi ; WIN32-NEXT: testl %eax, %eax ; WIN32-NEXT: setne %dl ; WIN32-NEXT: testl %esi, %esi diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll index 1c58b90f77dea..080b3dd75ee9a 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll @@ -47,17 +47,17 @@ define void @test2(ptr %struct, i32 %n) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cbz x0, .LBB1_3 ; CHECK-NEXT: // %bb.1: // %while_cond.preheader -; CHECK-NEXT: mov w9, #40000 // =0x9c40 -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: add x9, x0, x9 -; CHECK-NEXT: cmp w8, w1 +; CHECK-NEXT: mov w8, #40000 // =0x9c40 +; CHECK-NEXT: mov w9, wzr +; CHECK-NEXT: add x8, x0, x8 +; CHECK-NEXT: cmp w9, w1 ; CHECK-NEXT: b.ge .LBB1_3 ; CHECK-NEXT: .LBB1_2: // %while_body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str w8, [x9, #4] -; CHECK-NEXT: add w8, w8, #1 -; CHECK-NEXT: str w8, [x9] -; CHECK-NEXT: cmp w8, w1 +; CHECK-NEXT: str w9, [x8, #4] +; CHECK-NEXT: add w9, w9, #1 +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: cmp w9, w1 ; CHECK-NEXT: b.lt .LBB1_2 ; CHECK-NEXT: .LBB1_3: // %while_end ; CHECK-NEXT: ret @@ -86,20 +86,20 @@ define void @test3(ptr %s1, ptr %s2, i1 %cond, i32 %n) { ; CHECK-LABEL: test3: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: tst w2, #0x1 -; CHECK-NEXT: csel x9, x1, x0, ne -; CHECK-NEXT: cbz x9, .LBB2_3 +; CHECK-NEXT: csel x8, x1, x0, ne +; CHECK-NEXT: cbz x8, .LBB2_3 ; CHECK-NEXT: // %bb.1: // %while_cond.preheader ; CHECK-NEXT: mov w10, #40000 // =0x9c40 -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: add x9, x9, x10 -; CHECK-NEXT: cmp w8, w3 +; CHECK-NEXT: mov w9, wzr +; CHECK-NEXT: add x8, x8, x10 +; CHECK-NEXT: cmp w9, w3 ; CHECK-NEXT: b.ge .LBB2_3 ; CHECK-NEXT: .LBB2_2: // %while_body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str w8, [x9, #4] -; CHECK-NEXT: add w8, w8, #1 -; CHECK-NEXT: str w8, [x9] -; CHECK-NEXT: cmp w8, w3 +; CHECK-NEXT: str w9, [x8, #4] +; CHECK-NEXT: add w9, w9, #1 +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: cmp w9, w3 ; CHECK-NEXT: b.lt .LBB2_2 ; CHECK-NEXT: .LBB2_3: // %while_end ; CHECK-NEXT: ret @@ -150,8 +150,8 @@ define void @test4(i32 %n) uwtable personality ptr @__FrameHandler { ; CHECK-NEXT: .cfi_offset w30, -32 ; CHECK-NEXT: .cfi_remember_state ; CHECK-NEXT: mov w19, w0 -; CHECK-NEXT: mov w20, wzr -; CHECK-NEXT: mov w21, #40000 // =0x9c40 +; CHECK-NEXT: mov w21, wzr +; CHECK-NEXT: mov w20, #40000 // =0x9c40 ; CHECK-NEXT: .LBB3_1: // %while_cond ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: .Ltmp0: @@ -159,15 +159,15 @@ define void @test4(i32 %n) uwtable personality ptr @__FrameHandler { ; CHECK-NEXT: .Ltmp1: ; CHECK-NEXT: // %bb.2: // %while_cond_x.split ; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1 -; CHECK-NEXT: add x8, x0, x21 -; CHECK-NEXT: cmp w20, w19 +; CHECK-NEXT: add x8, x0, x20 +; CHECK-NEXT: cmp w21, w19 ; CHECK-NEXT: str wzr, [x8] ; CHECK-NEXT: b.ge .LBB3_4 ; CHECK-NEXT: // %bb.3: // %while_body ; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1 -; CHECK-NEXT: str w20, [x8, #4] -; CHECK-NEXT: add w20, w20, #1 -; CHECK-NEXT: str w20, [x8] +; CHECK-NEXT: str w21, [x8, #4] +; CHECK-NEXT: add w21, w21, #1 +; CHECK-NEXT: str w21, [x8] ; CHECK-NEXT: b .LBB3_1 ; CHECK-NEXT: .LBB3_4: // %while_end ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload @@ -220,18 +220,18 @@ declare i32 @__FrameHandler(...) define void @test5(ptr %s, i32 %n) { ; CHECK-LABEL: test5: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr x9, [x0] -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: add x9, x9, #19, lsl #12 // =77824 -; CHECK-NEXT: add x9, x9, #2176 -; CHECK-NEXT: cmp w8, w1 +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: mov w9, wzr +; CHECK-NEXT: add x8, x8, #19, lsl #12 // =77824 +; CHECK-NEXT: add x8, x8, #2176 +; CHECK-NEXT: cmp w9, w1 ; CHECK-NEXT: b.ge .LBB4_2 ; CHECK-NEXT: .LBB4_1: // %while_body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str w8, [x9, #4] -; CHECK-NEXT: add w8, w8, #1 -; CHECK-NEXT: str w8, [x9] -; CHECK-NEXT: cmp w8, w1 +; CHECK-NEXT: str w9, [x8, #4] +; CHECK-NEXT: add w9, w9, #1 +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: cmp w9, w1 ; CHECK-NEXT: b.lt .LBB4_1 ; CHECK-NEXT: .LBB4_2: // %while_end ; CHECK-NEXT: ret